From 83a1d3507f16f2c8c35fb5ce6939c662ccafdede Mon Sep 17 00:00:00 2001 From: bamader <49412165+bamader@users.noreply.github.com> Date: Thu, 13 Feb 2025 13:24:25 -0500 Subject: [PATCH 1/2] Remove the remnants of dibbs basic (#214) ## Description This PR removes the dibbs basic algorithm from the RecordLinker service, updating the documentation and unit tests to reflect this change. There's a lot of alterations, but tests are now stable and everything looks good across the board. Would probably prefer waiting to merge this until both other engineers get eyes on to make sure I didn't miss anything and to catch any outstanding nuances in test cases. ## Related Issues closes #205 --- .github/workflows/check_smoke_tests.yml | 2 +- README.md | 8 + docs/site/reference.md | 20 -- .../assets/initial_algorithms.json | 77 +----- src/recordlinker/linking/matchers.py | 77 ------ tests/algorithm/README.md | 2 +- .../algorithm_configuration.json | 65 +++-- .../possible_match_basic_patient_bundle.json | 164 ------------ ...ossible_match_default_patient_bundle.json} | 0 tests/unit/conftest.py | 12 +- tests/unit/database/test_algorithm_service.py | 59 ++--- tests/unit/linking/test_link.py | 132 ++++------ tests/unit/linking/test_matchers.py | 93 ------- tests/unit/models/test_algorithm.py | 24 +- tests/unit/routes/test_algorithm_router.py | 91 +++---- tests/unit/routes/test_link_router.py | 243 ++---------------- tests/unit/routes/test_seed_router.py | 4 +- tests/unit/schemas/test_algorithm.py | 41 ++- tests/unit/utils/test_functools.py | 36 +-- 19 files changed, 250 insertions(+), 900 deletions(-) delete mode 100644 tests/unit/assets/possible_match_basic_patient_bundle.json rename tests/unit/assets/{possible_match_enhanced_patient_bundle.json => possible_match_default_patient_bundle.json} (100%) diff --git a/.github/workflows/check_smoke_tests.yml b/.github/workflows/check_smoke_tests.yml index a384e112..f8ad760f 100644 --- a/.github/workflows/check_smoke_tests.yml +++ 
b/.github/workflows/check_smoke_tests.yml @@ -81,7 +81,7 @@ jobs: # Run smoke tests and print the response JSON_BODY_1='{"record": {"birth_date": "2053-11-07", "sex": "M", "identifiers":[{"value": "123456789", "type": "MR"}], "name":[{"family":"Shepard", "given":["John"]}]}}' - JSON_BODY_2='{"algorithm": "dibbs-enhanced", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}' + JSON_BODY_2='{"algorithm": "dibbs-default", "record": {"birth_date": "2000-12-06", "sex": "M", "identifiers":[{"value": "9876543210", "type": "MR"}], "name":[{"family":"Smith", "given":["William"]}]}}' #basic tests RESPONSE_1=$(curl -s -X POST http://localhost:8080/link \ diff --git a/README.md b/README.md index 1df1b78c..64e842fd 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,14 @@ To run the API locally, use the following command: The API will be available at `http://localhost:8000`. Visit `http://localhost:8000/redoc` to view the API documentation. +## Testing + +The RecordLinker system comes with a number of built-in tests spread across several different types. Some of these tests are run automatically (e.g. by Github), while others must be manually executed by a developer. + +- `tests/unit`: These comprise basic unit (and in some cases integration) tests providing code coverage to RecordLinker. These tests demonstrate the functionality of different parts of the code base under different logical conditions and with different inputs and outputs. They are automataically executed by a Github Actions workflow as part of a PR. +- `tests/algorithm`: This is a set of scripts developed to test an algorithm configuration with a known set of particular edge cases. In response to frequent questions of how the DIBBs algorithm handles case X, this mini-project was created to help answer those questions by giving developers some persistent evaluation tools. 
These tests are _not_ automated, and developers will need to go through the steps in the README in the relevant directory in order to run them. +- `tests/performance`: Another set of scripts developed to see how fast the API can process linkage requests using synthetic data. This is useful for verifying refactors are still performant and helping developers identify bottlenecks along the way. These tests are _not_ automated, and developers need to go through the steps in the README of the relevant directory in order to run them. + ### Running unit tests To run all the unit tests, use the following command: diff --git a/docs/site/reference.md b/docs/site/reference.md index 66d9462e..731dcc5b 100644 --- a/docs/site/reference.md +++ b/docs/site/reference.md @@ -127,11 +127,6 @@ patient data and used during query retrieval. The following blocking key types a These are the functions that can be used to evaluate the matching results as a collection, thus determining it the incoming payload is a match or not to an existing Patient record. -`func:recordlinker.linking.matchers.rule_match` - -: Determines whether a given set of feature comparisons represent a 'perfect' match - (i.e. all features that were compared match in whatever criteria was specified). - `func:recordlinker.linking.matchers.rule_probabilistic_match` : Determines whether a given set of feature comparisons matches enough to be the @@ -154,21 +149,6 @@ that to an existing Patient with the ADDRESS of [{"address": ["123 Main Street"], "city": "Springfield", "state": "IL"}, {"address": ["456 Elm St"], "state": "IL"}]. In that case we'd want to evaluate "123 Main St" against both "123 Main Street" and "456 Elm St". -`func:recordlinker.linking.matchers.compare_match_any` - -: Determines if any of the features are a direct match. - -`func:recordlinker.linking.matchers.compare_match_all` - -: Determines if all of the features are a direct match. 
- -`func:recordlinker.linking.matchers.compare_fuzzy_match` - -: Determines if the features are a fuzzy match based on a string comparison. - JaroWinkler, Levenshtein and Damerau-Levenshtein are supported, with JaroWinkler as the default. - Use the `kwargs` parameter to specify the desired algorithm and thresholds. - Example: `{"kwargs": {"similarity_measure": "levenshtein", "thresholds": {"FIRST_NAME": 0.8}}}` - `func:recordlinker.linking.matchers.compare_probabilistic_exact_match` : Determines if a Feature Field has the same value in two different patient records. If the two fields agree diff --git a/src/recordlinker/assets/initial_algorithms.json b/src/recordlinker/assets/initial_algorithms.json index 1d5addd8..595b0802 100644 --- a/src/recordlinker/assets/initial_algorithms.json +++ b/src/recordlinker/assets/initial_algorithms.json @@ -1,76 +1,13 @@ [ { - "label": "dibbs-basic", - "description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives", + "label": "dibbs-default", + "description": "The core DIBBs Log-Odds Algorithm. This default, recommended algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. 
For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.", "is_default": true, "include_multiple_matches": true, - "belongingness_ratio": [0.75, 0.9], - "passes": [ - { - "blocking_keys": [ - "BIRTHDATE", - "IDENTIFIER", - "SEX" - ], - "evaluators": [ - { - "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match" - }, - { - "feature": "LAST_NAME", - "func": "func:recordlinker.linking.matchers.compare_match_all" - } - ], - "rule": "func:recordlinker.linking.matchers.rule_match", - "kwargs": { - "thresholds": { - "FIRST_NAME": 0.9, - "LAST_NAME": 0.9, - "BIRTHDATE": 0.95, - "ADDRESS": 0.9, - "CITY": 0.92, - "ZIP": 0.95 - } - } - }, - { - "blocking_keys": [ - "ZIP", - "FIRST_NAME", - "LAST_NAME", - "SEX" - ], - "evaluators": [ - { - "feature": "ADDRESS", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match" - }, - { - "feature": "BIRTHDATE", - "func": "func:recordlinker.linking.matchers.compare_match_all" - } - ], - "rule": "func:recordlinker.linking.matchers.rule_match", - "kwargs": { - "thresholds": { - "FIRST_NAME": 0.9, - "LAST_NAME": 0.9, - "BIRTHDATE": 0.95, - "ADDRESS": 0.9, - "CITY": 0.92, - "ZIP": 0.95 - } - } - } - ] - }, - { - "label": "dibbs-enhanced", - "description": "The DIBBs Log-Odds Algorithm. This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. 
For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.", - "is_default": false, - "include_multiple_matches": true, - "belongingness_ratio": [0.75, 0.9], + "belongingness_ratio": [ + 0.75, + 0.9 + ], "passes": [ { "blocking_keys": [ @@ -157,4 +94,4 @@ } ] } -] +] \ No newline at end of file diff --git a/src/recordlinker/linking/matchers.py b/src/recordlinker/linking/matchers.py index 5daff2e7..946b98cc 100644 --- a/src/recordlinker/linking/matchers.py +++ b/src/recordlinker/linking/matchers.py @@ -30,7 +30,6 @@ class RuleFunc(enum.Enum): the algorithm. """ - RULE_MATCH = "func:recordlinker.linking.matchers.rule_match" RULE_PROBABILISTIC_MATCH = "func:recordlinker.linking.matchers.rule_probabilistic_match" @@ -44,9 +43,6 @@ class FeatureFunc(enum.Enum): matching, based on the configuration of the algorithm. """ - COMPARE_MATCH_ANY = "func:recordlinker.linking.matchers.compare_match_any" - COMPARE_MATCH_ALL = "func:recordlinker.linking.matchers.compare_match_all" - COMPARE_FUZZY_MATCH = "func:recordlinker.linking.matchers.compare_fuzzy_match" COMPARE_PROBABILISTIC_EXACT_MATCH = ( "func:recordlinker.linking.matchers.compare_probabilistic_exact_match" ) @@ -102,19 +98,6 @@ def _get_fuzzy_params(col: str, **kwargs) -> tuple[SIMILARITY_MEASURES, float]: return (similarity_measure, threshold) -def rule_match(feature_comparisons: list[float], **kwargs: typing.Any) -> bool: - """ - Determines whether a given set of feature comparisons represent a - 'perfect' match (i.e. whether all features that were compared match - in whatever criteria was specified for them). - - :param feature_comparisons: A list of 1s and 0s, one for each feature - that was compared during the match algorithm. - :return: The evaluation of whether the given features all match. 
- """ - return sum(feature_comparisons) == len(feature_comparisons) - - def rule_probabilistic_match(feature_comparisons: list[float], **kwargs: typing.Any) -> bool: """ Determines whether a given set of feature comparisons matches enough @@ -132,66 +115,6 @@ def rule_probabilistic_match(feature_comparisons: list[float], **kwargs: typing. return sum(feature_comparisons) >= float(threshold) -def compare_match_any( - record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any -) -> float: - """ - ... - - :param record: The incoming record to evaluate. - :param patient: The patient record to compare against. - :param key: The name of the column being evaluated (e.g. "city"). - :return: A float indicating whether any of the features are an exact match. - """ - rec_values = set(record.feature_iter(key)) - if not rec_values: - return 0 - pat_values = set(patient.record.feature_iter(key)) - return float(bool(rec_values & pat_values)) - - -def compare_match_all( - record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any -) -> float: - """ - ... - - :param record: The incoming record to evaluate. - :param patient: The patient record to compare against. - :param key: The name of the column being evaluated (e.g. "city"). - :return: A float indicating whether all of the features are an exact match. - """ - rec_values = set(record.feature_iter(key)) - if not rec_values: - return 0 - pat_values = set(patient.record.feature_iter(key)) - return float(rec_values == pat_values) - - -def compare_fuzzy_match( - record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any -) -> float: - """ - ... - - :param record: The incoming record to evaluate. - :param patient: The patient record to compare against. - :param key: The name of the column being evaluated (e.g. "city"). 
- :param **kwargs: Optionally, a dictionary including specifications for - the string comparison metric to use, as well as the cutoff score - beyond which to classify the strings as a partial match. - :return: A float indicating whether the features are a fuzzy match. - """ - similarity_measure, threshold = _get_fuzzy_params(str(key.attribute), **kwargs) - comp_func = getattr(rapidfuzz.distance, similarity_measure).normalized_similarity - for x in record.feature_iter(key): - for y in patient.record.feature_iter(key): - score = comp_func(x, y) - if score >= threshold: - return 1 - return 0 - - def compare_probabilistic_exact_match( record: PIIRecord, patient: Patient, key: Feature, **kwargs: typing.Any ) -> float: diff --git a/tests/algorithm/README.md b/tests/algorithm/README.md index 861a1ccf..2b36418d 100644 --- a/tests/algorithm/README.md +++ b/tests/algorithm/README.md @@ -92,7 +92,7 @@ The following environment variables can be tuned in the `algo-test.env` file: - `SEED_FILE`: The file containing person data to seed the mpi with - `TEST_FILE`: The file containing patient data to test the algorithm with - `ALGORITHM_CONFIGURATION`: The file containing the algorithm configuration json -- `ALGORITHM_NAME`: The name of the algorithm to use (either the name of your `ALGORITHM_CONFIGURATION` or can be the built in `dibbs-basic` or `dibbs-enhanced` algorithms) +- `ALGORITHM_NAME`: The name of the algorithm to use (either the name of your `ALGORITHM_CONFIGURATION` or can be the built in `dibbs-default` algorithms) ## Cleanup diff --git a/tests/algorithm/configurations/algorithm_configuration.json b/tests/algorithm/configurations/algorithm_configuration.json index 6abde868..c68725d7 100644 --- a/tests/algorithm/configurations/algorithm_configuration.json +++ b/tests/algorithm/configurations/algorithm_configuration.json @@ -3,23 +3,26 @@ "description": "test algorithm configuration", "is_default": false, "include_multiple_matches": true, - "belongingness_ratio": [0.75, 
0.9], + "belongingness_ratio": [ + 0.75, + 0.9 + ], "passes": [ { "blocking_keys": [ "BIRTHDATE" ], "evaluators": [ - { - "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.feature_match_fuzzy_string" - }, - { - "feature": "LAST_NAME", - "func": "func:recordlinker.linking.matchers.feature_match_exact" - } + { + "feature": "FIRST_NAME", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" + }, + { + "feature": "LAST_NAME", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" + } ], - "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "rule": "func:recordlinker.linking.matchers.rule_probabilistic_match", "cluster_ratio": 0.9, "kwargs": { "thresholds": { @@ -29,6 +32,18 @@ "ADDRESS": 0.9, "CITY": 0.92, "ZIP": 0.95 + }, + "true_match_threshold": 12.2, + "log_odds": { + "ADDRESS": 8.438284928858774, + "BIRTHDATE": 10.126641103800338, + "CITY": 2.438553006137189, + "FIRST_NAME": 6.849475906891162, + "LAST_NAME": 6.350720397426025, + "IDENTIFIER:MR": 0.3051262572525359, + "SEX": 0.7510419059643679, + "STATE": 0.022376768992488694, + "ZIP": 4.975031471124867 } } }, @@ -40,16 +55,16 @@ "SEX" ], "evaluators": [ - { - "feature": "ADDRESS", - "func": "func:recordlinker.linking.matchers.feature_match_fuzzy_string" - }, - { - "feature": "BIRTHDATE", - "func": "func:recordlinker.linking.matchers.feature_match_exact" - } + { + "feature": "ADDRESS", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" + }, + { + "feature": "BIRTHDATE", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" + } ], - "rule": "func:recordlinker.linking.matchers.eval_perfect_match", + "rule": "func:recordlinker.linking.matchers.rule_probabilistic_match", "cluster_ratio": 0.9, "kwargs": { "thresholds": { @@ -59,6 +74,18 @@ "ADDRESS": 0.9, "CITY": 0.92, "ZIP": 0.95 + }, + "true_match_threshold": 17.0, + "log_odds": { + "ADDRESS": 8.438284928858774, + 
"BIRTHDATE": 10.126641103800338, + "CITY": 2.438553006137189, + "FIRST_NAME": 6.849475906891162, + "LAST_NAME": 6.350720397426025, + "IDENTIFIER:MR": 0.3051262572525359, + "SEX": 0.7510419059643679, + "STATE": 0.022376768992488694, + "ZIP": 4.975031471124867 } } } diff --git a/tests/unit/assets/possible_match_basic_patient_bundle.json b/tests/unit/assets/possible_match_basic_patient_bundle.json deleted file mode 100644 index 7d5d70d3..00000000 --- a/tests/unit/assets/possible_match_basic_patient_bundle.json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "resourceType": "Bundle", - "identifier": { - "value": "a very contrived FHIR bundle" - }, - "entry": [ - { - "resource": { - "resourceType": "Patient", - "id": "f6a16ff7-4a31-11eb-be7b-8344edc8f36b", - "identifier": [ - { - "value": "123456789", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Smith", - "given": [ - "John" - ], - "use": "official" - } - ], - "birthDate": "1980-01-01", - "gender": "male", - "address": [ - { - "line": [ - "1234 Silversun Strip" - ], - "buildingNumber": "1234", - "city": "Boston", - "state": "Massachusetts", - "postalCode": "99999", - "use": "home" - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "2fdd0b8b-4a70-11eb-99fd-ad786a821574", - "identifier": [ - { - "value": "123456789", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Smith", - "given": [ - "Johnny" - ], - "use": "official" - } - ], - "birthDate": "1980-01-01", - "gender": "male", - "address": [ - { - "line": [ - "1234 Silversun Strip" - ], - "buildingNumber": "1234", - "city": "Boston", - "state": "Massachusetts", - "postalCode": 
"99999", - "use": "home" - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - } - ] - } - }, - { - "resource": { - "resourceType": "Patient", - "id": "2fdd0b8b-4a70-11eb-99fd-ad786a821574", - "identifier": [ - { - "value": "123456789", - "type": { - "coding": [ - { - "code": "MR", - "system": "http://terminology.hl7.org/CodeSystem/v2-0203", - "display": "Medical record number" - } - ] - } - } - ], - "name": [ - { - "family": "Smith", - "given": [ - "Jon" - ], - "use": "official" - } - ], - "birthDate": "1980-01-01", - "gender": "male", - "address": [ - { - "line": [ - "Bay 16", - "Ward Sector 24" - ], - "city": "Brooklyn", - "state": "New York", - "postalCode": "54321", - "use": "home" - } - ], - "telecom": [ - { - "use": "home", - "system": "phone", - "value": "123-456-7890" - } - ] - } - }, - { - "request": { - "method": "GET", - "url": "testing for entry with no resource" - } - } - ] -} diff --git a/tests/unit/assets/possible_match_enhanced_patient_bundle.json b/tests/unit/assets/possible_match_default_patient_bundle.json similarity index 100% rename from tests/unit/assets/possible_match_enhanced_patient_bundle.json rename to tests/unit/assets/possible_match_default_patient_bundle.json diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2881a3e1..a79f3fd9 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -73,17 +73,9 @@ def clean_test_database(): @functools.lru_cache @pytest.fixture -def basic_algorithm(): +def default_algorithm(): for algo in utils.read_json("assets/initial_algorithms.json"): - if algo["label"] == "dibbs-basic": - return models.Algorithm.from_dict(**algo) - - -@functools.lru_cache -@pytest.fixture -def enhanced_algorithm(): - for algo in utils.read_json("assets/initial_algorithms.json"): - if algo["label"] == "dibbs-enhanced": + if algo["label"] == "dibbs-default": return models.Algorithm.from_dict(**algo) diff --git a/tests/unit/database/test_algorithm_service.py 
b/tests/unit/database/test_algorithm_service.py index 49af059f..aac87281 100644 --- a/tests/unit/database/test_algorithm_service.py +++ b/tests/unit/database/test_algorithm_service.py @@ -15,34 +15,32 @@ def test_list_algorithms(session): - algo1 = models.Algorithm(label="basic", is_default=True, description="First algorithm") - algo2 = models.Algorithm(label="enhanced", description="Second algorithm") + algo1 = models.Algorithm(label="default", description="First algorithm") + algo2 = models.Algorithm(label="user", description="User uploaded algorithm") session.add(algo1) session.add(algo2) session.commit() algorithms = algorithm_service.list_algorithms(session) assert len(algorithms) == 2 - assert [a.label for a in algorithms] == ["basic", "enhanced"] + assert [a.label for a in algorithms] == ["default", "user"] def test_default_algorithm(session): assert algorithm_service.default_algorithm(session) is None - algo1 = models.Algorithm(label="basic", is_default=False, description="First algorithm") - algo2 = models.Algorithm(label="enhanced", is_default=True, description="Second algorithm") + algo1 = models.Algorithm(label="default", is_default=True, description="First algorithm") session.add(algo1) - session.add(algo2) session.commit() default = algorithm_service.default_algorithm(session) - assert default == algo2 + assert default == algo1 class TestGetAlgorithm: def test_get_algorithm_match(self, session): - testLabel = "dibss_basic" - algo1 = models.Algorithm(label=testLabel, is_default=True, description="First algorithm") + testLabel = "dibbs-default" + algo1 = models.Algorithm(label="dibbs-default", description="First algorithm") session.add(algo1) session.commit() @@ -51,9 +49,7 @@ def test_get_algorithm_match(self, session): def test_get_algorithm_no_match(self, session): # inserting the default algorithm - algo1 = models.Algorithm( - label="dibss_basic", is_default=True, description="First algorithm" - ) + algo1 = models.Algorithm(label="default", 
description="First algorithm") session.add(algo1) session.commit() @@ -64,7 +60,7 @@ def test_get_algorithm_no_match(self, session): class TestLoadAlgorithm: def test_load_algorithm_created(self, session): data = schemas.Algorithm( - label="dibss-basic", + label="dibbs-test", description="First algorithm", belongingness_ratio=(0.75, 0.8), passes=[ @@ -73,31 +69,32 @@ def test_load_algorithm_created(self, session): evaluators=[ { "feature": "ZIP", - "func": "func:recordlinker.linking.matchers.compare_match_any", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) ], + ) obj, created = algorithm_service.load_algorithm(session, data) session.flush() assert created is True assert obj.id == 1 - assert obj.label == "dibss-basic" + assert obj.label == "dibbs-test" assert obj.description == "First algorithm" assert obj.belongingness_ratio == (0.75, 0.8) assert len(obj.passes) == 1 assert obj.passes[0].algorithm_id == 1 assert obj.passes[0].blocking_keys == ["FIRST_NAME"] assert obj.passes[0].evaluators == [ - {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"} ] - assert obj.passes[0].rule == "func:recordlinker.linking.matchers.rule_match" + assert obj.passes[0].rule == "func:recordlinker.linking.matchers.rule_probabilistic_match" def test_load_algorithm_updated(self, session): data = schemas.Algorithm( - label="dibss-basic", + label="dibbs-test", description="First algorithm", belongingness_ratio=(0.75, 0.8), passes=[ @@ -106,10 +103,10 @@ def test_load_algorithm_updated(self, session): evaluators=[ { "feature": "ZIP", - "func": "func:recordlinker.linking.matchers.compare_match_any", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ], - 
rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) ], ) @@ -121,30 +118,30 @@ def test_load_algorithm_updated(self, session): session.flush() assert created is False assert obj.id == 1 - assert obj.label == "dibss-basic" + assert obj.label == "dibbs-test" assert obj.description == "Updated description" assert obj.belongingness_ratio == (0.75, 0.8) assert len(obj.passes) == 1 assert obj.passes[0].algorithm_id == 1 assert obj.passes[0].blocking_keys == ["LAST_NAME"] assert obj.passes[0].evaluators == [ - {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"} ] - assert obj.passes[0].rule == "func:recordlinker.linking.matchers.rule_match" + assert obj.passes[0].rule == "func:recordlinker.linking.matchers.rule_probabilistic_match" def test_delete_algorithm(session): with pytest.raises(sqlalchemy.exc.SQLAlchemyError): algorithm_service.delete_algorithm(session, models.Algorithm()) - algo1 = models.Algorithm(label="basic", is_default=True, description="First algorithm") + algo1 = models.Algorithm(label="default", is_default=True, description="First algorithm") session.add(algo1) pass1 = models.AlgorithmPass( algorithm=algo1, blocking_keys=["FIRST_NAME"], evaluators=[ - {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"} ], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) session.add(pass1) session.commit() @@ -155,17 +152,15 @@ def test_delete_algorithm(session): def test_clear_algorithms(session): - algo1 = models.Algorithm(label="basic", is_default=True, description="First algorithm") - algo2 = models.Algorithm(label="enhanced", description="Second algorithm") + 
algo1 = models.Algorithm(label="default", is_default=True, description="First algorithm") session.add(algo1) - session.add(algo2) pass1 = models.AlgorithmPass( algorithm=algo1, blocking_keys=["FIRST_NAME"], evaluators=[ - {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "ZIP", "func": "func:recordlinker.linking.matchers.compare_probabilistic_match"} ], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) session.add(pass1) session.commit() diff --git a/tests/unit/linking/test_link.py b/tests/unit/linking/test_link.py index 9433353a..2ea85499 100644 --- a/tests/unit/linking/test_link.py +++ b/tests/unit/linking/test_link.py @@ -50,11 +50,11 @@ def test_compare_match(self): algorithm_id=1, blocking_keys=[1], evaluators=[ - {"feature": "FIRST_NAME", "func": "func:recordlinker.linking.matchers.compare_match_all"}, - {"feature": "LAST_NAME", "func": "func:recordlinker.linking.matchers.compare_fuzzy_match"}, + {"feature": "FIRST_NAME", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, + {"feature": "LAST_NAME", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, ], - rule="func:recordlinker.linking.matchers.rule_match", - kwargs={}, + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", + kwargs={"log_odds": {"FIRST_NAME": 6.85, "LAST_NAME": 6.35}, "true_match_threshold": 12}, ) assert link.compare(rec, pat, algorithm_pass) is True @@ -89,11 +89,11 @@ def test_compare_no_match(self): algorithm_id=1, blocking_keys=[1], evaluators=[ - {"feature": "FIRST_NAME", "func": "func:recordlinker.linking.matchers.compare_match_all"}, - {"feature": "LAST_NAME", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + {"feature": "FIRST_NAME", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, + {"feature": "LAST_NAME", "func": 
"func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, ], - rule="func:recordlinker.linking.matchers.rule_match", - kwargs={}, + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", + kwargs={"log_odds": {"FIRST_NAME": 6.85, "LAST_NAME": 6.35}, "true_match_threshold": 12.95}, ) assert link.compare(rec, pat, algorithm_pass) is False @@ -137,10 +137,10 @@ def test_compare_identifier_match(self): algorithm_id=1, blocking_keys=[1], evaluators=[ - {"feature": "IDENTIFIER", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + {"feature": "IDENTIFIER", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, ], - rule="func:recordlinker.linking.matchers.rule_match", - kwargs={}, + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", + kwargs={"log_odds": {"IDENTIFIER": 0.35}, "true_match_threshold": 0.3}, ) assert link.compare(rec, pat, algorithm_pass) is True @@ -184,16 +184,16 @@ def test_compare_identifier_with_suffix(self): algorithm_id=1, blocking_keys=[1], evaluators=[ - {"feature": "IDENTIFIER:MR", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + {"feature": "IDENTIFIER:MR", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, ], - rule="func:recordlinker.linking.matchers.rule_match", - kwargs={}, + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", + kwargs={"log_odds": {"IDENTIFIER": 0.35}, "true_match_threshold": 0.3}, ) #should pass as MR is the same for both assert link.compare(rec, pat, algorithm_pass) is True - algorithm_pass.evaluators = [{"feature": "IDENTIFIER:SS", "func": "func:recordlinker.linking.matchers.compare_match_all"}] + algorithm_pass.evaluators = [{"feature": "IDENTIFIER:SS", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}] #should fail as SS is different for both assert link.compare(rec, pat, algorithm_pass) is False @@ -228,9 +228,9 @@ def 
test_compare_invalid_feature(self): algorithm_id=1, blocking_keys=[1], evaluators=[ - {"feature": "FIRST_NAME:DL", "func": "func:recordlinker.linking.matchers.compare_match_all"}, + {"feature": "FIRST_NAME:DL", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, ], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", kwargs={}, ) @@ -249,20 +249,10 @@ def patients(self): if entry.get("resource", {}).get("resourceType", {}) == "Patient": patients.append(fhir.fhir_record_to_pii_record(entry["resource"])) return patients - - @pytest.fixture - def possible_match_basic_patients(self): - bundle = load_test_json_asset("possible_match_basic_patient_bundle.json") - patients = [] - patients: list[schemas.PIIRecord] = [] - for entry in bundle["entry"]: - if entry.get("resource", {}).get("resourceType", {}) == "Patient": - patients.append(fhir.fhir_record_to_pii_record(entry["resource"])) - return patients @pytest.fixture - def possible_match_enhanced_patients(self): - bundle = load_test_json_asset("possible_match_enhanced_patient_bundle.json") + def possible_match_default_patients(self): + bundle = load_test_json_asset("possible_match_default_patient_bundle.json") patients = [] patients: list[schemas.PIIRecord] = [] for entry in bundle["entry"]: @@ -280,12 +270,12 @@ def multiple_matches_patients(self): patients.append(fhir.fhir_record_to_pii_record(entry["resource"])) return patients - def test_basic_match_one(self, session, basic_algorithm, patients): + def test_default_match_one(self, session, default_algorithm, patients): # Test various null data values in incoming record matches: list[bool] = [] mapped_patients: dict[str, int] = collections.defaultdict(int) for data in patients[:2]: - (_, person, results, _) = link.link_record_against_mpi(data, session, basic_algorithm) + (_, person, results, _) = link.link_record_against_mpi(data, session, default_algorithm) 
matches.append(bool(person and results)) mapped_patients[person.reference_id] += 1 @@ -294,11 +284,11 @@ def test_basic_match_one(self, session, basic_algorithm, patients): assert matches == [False, True] assert sorted(list(mapped_patients.values())) == [2] - def test_basic_match_two(self, session, basic_algorithm, patients): + def test_default_match_two(self, session, default_algorithm, patients): matches: list[bool] = [] mapped_patients: dict[str, int] = collections.defaultdict(int) for data in patients: - (_, person, results, _) = link.link_record_against_mpi(data, session, basic_algorithm) + (_, person, results, _) = link.link_record_against_mpi(data, session, default_algorithm) matches.append(bool(person and results)) mapped_patients[person.reference_id] += 1 @@ -314,33 +304,7 @@ def test_basic_match_two(self, session, basic_algorithm, patients): assert matches == [False, True, False, True, False, False] assert sorted(list(mapped_patients.values())) == [1, 1, 1, 3] - def test_basic_possible_match( - self, - session, - basic_algorithm, - possible_match_basic_patients: list[schemas.PIIRecord] - ): - predictions: dict[str, dict] = collections.defaultdict(dict) - # Decrease Belongingness Ratio lower bound to catch Possible Match when Belongingness Ratio = 0.5 - for lower_bound in [0.5, 0.45]: # test >= lower bound - basic_algorithm.belongingness_ratio_lower_bound = lower_bound - for i, data in enumerate(possible_match_basic_patients): - (patient, person, results, prediction) = link.link_record_against_mpi(data, session, basic_algorithm) - predictions[i] = { - "patient": patient, - "person": person, - "results": results, - "prediction": prediction - } - # 1 Possible Match - assert not predictions[2]["person"] - assert len(predictions[2]["results"]) == 1 - assert predictions[2]["results"][0].person == predictions[0]["person"] - assert predictions[2]["results"][0].belongingness_ratio >= basic_algorithm.belongingness_ratio_lower_bound - assert 
predictions[2]["results"][0].belongingness_ratio < basic_algorithm.belongingness_ratio_upper_bound - assert predictions[2]["prediction"] == "possible_match" - - def test_enhanced_match_three(self, session, enhanced_algorithm, patients: list[schemas.PIIRecord]): + def test_default_match_three(self, session, default_algorithm, patients: list[schemas.PIIRecord]): # add an additional patient that will fuzzy match to patient 0 patient0_copy = copy.deepcopy(patients[0]) patient0_copy.external_id = str(uuid.uuid4()) @@ -349,7 +313,7 @@ def test_enhanced_match_three(self, session, enhanced_algorithm, patients: list[ matches: list[bool] = [] mapped_patients: dict[str, int] = collections.defaultdict(int) for data in patients: - (_, person, results, _) = link.link_record_against_mpi(data, session, enhanced_algorithm) + (_, person, results, _) = link.link_record_against_mpi(data, session, default_algorithm) matches.append(bool(person and results)) mapped_patients[person.reference_id] += 1 @@ -365,18 +329,18 @@ def test_enhanced_match_three(self, session, enhanced_algorithm, patients: list[ assert matches == [False, True, False, True, False, False, True] assert sorted(list(mapped_patients.values())) == [1, 1, 1, 4] - def test_enhanced_possible_match( + def test_default_possible_match( self, session, - enhanced_algorithm, - possible_match_enhanced_patients: list[schemas.PIIRecord] + default_algorithm, + possible_match_default_patients: list[schemas.PIIRecord] ): predictions: dict[str, dict] = collections.defaultdict(dict) # Decrease Belongingness Ratio lower bound to catch Possible Match when Belongingness Ratio = 0.5 for lower_bound in [0.5, 0.45]: # test >= lower bound - enhanced_algorithm.belongingness_ratio_lower_bound = lower_bound - for i, data in enumerate(possible_match_enhanced_patients): - (patient, person, results, prediction) = link.link_record_against_mpi(data, session, enhanced_algorithm) + default_algorithm.belongingness_ratio_lower_bound = lower_bound + for i, 
data in enumerate(possible_match_default_patients): + (patient, person, results, prediction) = link.link_record_against_mpi(data, session, default_algorithm) predictions[i] = { "patient": patient, "person": person, @@ -387,23 +351,23 @@ def test_enhanced_possible_match( assert not predictions[2]["person"] assert len(predictions[2]["results"]) == 1 assert predictions[2]["results"][0].person == predictions[0]["person"] - assert predictions[2]["results"][0].belongingness_ratio >= enhanced_algorithm.belongingness_ratio_lower_bound - assert predictions[2]["results"][0].belongingness_ratio < enhanced_algorithm.belongingness_ratio_upper_bound + assert predictions[2]["results"][0].belongingness_ratio >= default_algorithm.belongingness_ratio_lower_bound + assert predictions[2]["results"][0].belongingness_ratio < default_algorithm.belongingness_ratio_upper_bound assert predictions[2]["prediction"] == "possible_match" def test_include_multiple_matches_true( self, session, - basic_algorithm, + default_algorithm, multiple_matches_patients: list[schemas.PIIRecord] ): predictions: dict[str, dict] = collections.defaultdict(dict) # Adjust Belongingness Ratio bounds to catch Match when Belongingness Ratio = 0.5 - basic_algorithm.belongingness_ratio_lower_bound = 0.3 + default_algorithm.belongingness_ratio_lower_bound = 0.3 for upper_bound in [0.5, 0.45]: # test >= upper bound - basic_algorithm.belongingness_ratio_upper_bound = upper_bound + default_algorithm.belongingness_ratio_upper_bound = upper_bound for i, data in enumerate(multiple_matches_patients): - (patient, person, results, prediction) = link.link_record_against_mpi(data, session, basic_algorithm) + (patient, person, results, prediction) = link.link_record_against_mpi(data, session, default_algorithm) predictions[i] = { "patient": patient, "person": person, @@ -414,23 +378,23 @@ def test_include_multiple_matches_true( assert len(predictions[3]["results"]) == 2 assert predictions[3]["person"] == predictions[1]["person"] # 
Assign to Person with highest Belongingness Ratio (1.0) for match in predictions[2]["results"]: - assert match.belongingness_ratio >= basic_algorithm.belongingness_ratio_upper_bound + assert match.belongingness_ratio >= default_algorithm.belongingness_ratio_upper_bound assert predictions[3]["prediction"] == "match" def test_include_multiple_matches_false( self, session, - basic_algorithm, + default_algorithm, multiple_matches_patients: list[schemas.PIIRecord] ): predictions: dict[str, dict] = collections.defaultdict(dict) - basic_algorithm.include_multiple_matches = False + default_algorithm.include_multiple_matches = False # Adjust Belongingness Ratio bounds to catch Match when Belongingness Ratio = 0.5 - basic_algorithm.belongingness_ratio_lower_bound = 0.3 + default_algorithm.belongingness_ratio_lower_bound = 0.3 for upper_bound in [0.5, 0.45]: # test >= upper bound - basic_algorithm.belongingness_ratio_upper_bound = upper_bound + default_algorithm.belongingness_ratio_upper_bound = upper_bound for i, data in enumerate(multiple_matches_patients): - (patient, person, results, prediction) = link.link_record_against_mpi(data, session, basic_algorithm) + (patient, person, results, prediction) = link.link_record_against_mpi(data, session, default_algorithm) predictions[i] = { "patient": patient, "person": person, @@ -440,20 +404,20 @@ def test_include_multiple_matches_false( # 2 Matches, but only include 1 assert len(predictions[3]["results"]) == 1 assert predictions[3]["person"] == predictions[1]["person"] # Assign to Person with highest Belongingness Ratio (1.0) - assert predictions[3]["results"][0].belongingness_ratio >= basic_algorithm.belongingness_ratio_upper_bound + assert predictions[3]["results"][0].belongingness_ratio >= default_algorithm.belongingness_ratio_upper_bound assert predictions[3]["prediction"] == "match" - def test_no_persist(self, session, basic_algorithm, patients): + def test_no_persist(self, session, default_algorithm, patients): # First 
patient inserted into MPI, no match first = patients[0] - (pat1, per1, results, prediction) = link.link_record_against_mpi(first, session, basic_algorithm, persist=True) + (pat1, per1, results, prediction) = link.link_record_against_mpi(first, session, default_algorithm, persist=True) assert prediction == "no_match" assert pat1 is not None assert per1 is not None assert not results # Second patient not inserted into MPI, match first person second = patients[1] - (pat2, per2, results, prediction) = link.link_record_against_mpi(second, session, basic_algorithm, persist=False) + (pat2, per2, results, prediction) = link.link_record_against_mpi(second, session, default_algorithm, persist=False) assert prediction == "match" assert pat2 is None assert per2 is not None @@ -461,7 +425,7 @@ def test_no_persist(self, session, basic_algorithm, patients): assert results # Third patient not inserted into MPI, no match third = patients[2] - (pat3, per3, results, prediction) = link.link_record_against_mpi(third, session, basic_algorithm, persist=False) + (pat3, per3, results, prediction) = link.link_record_against_mpi(third, session, default_algorithm, persist=False) assert prediction == "no_match" assert pat3 is None assert per3 is None diff --git a/tests/unit/linking/test_matchers.py b/tests/unit/linking/test_matchers.py index b245841b..5f114c4e 100644 --- a/tests/unit/linking/test_matchers.py +++ b/tests/unit/linking/test_matchers.py @@ -55,99 +55,6 @@ def test_get_fuzzy_params(): assert matchers._get_fuzzy_params("first_name", **kwargs) == ("Levenshtein", 0.7) -def test_compare_match_any(): - record = schemas.PIIRecord( - name=[{"given": ["John", "Michael"], "family": "Smith"}, {"family": "Harrison"}], - birthDate="Jan 1 1980", - ) - pat1 = models.Patient( - data={"name": [{"given": ["John", "Michael"], "family": "Doe"}], "birthDate": "1980-01-01"} - ) - pat2 = models.Patient(data={"name": [{"given": ["Michael"], "family": "Smith"}], "sex": "male"}) - pat3 = 
models.Patient(data={"name": [{"family": "Smith"}, {"family": "Williams"}]}) - - assert matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) - assert matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert not matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) - assert not matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) - - assert not matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) - assert not matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert not matchers.compare_match_any(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.SEX)) - assert not matchers.compare_match_any(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) - - assert not matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) - assert not matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert not matchers.compare_match_any(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) - - with pytest.raises(ValueError): - matchers.compare_match_any(record, pat1, "unknown") - - -def test_compare_match_all(): - record = schemas.PIIRecord( - name=[{"given": ["John"], "family": "Smith"}, {"family": "Harrison"}], - birthDate="December 31, 1999", - ) - pat1 = models.Patient( - data={"name": 
[{"given": ["John", "Michael"], "family": "Doe"}], "birthDate": "1999-12-31"} - ) - pat2 = models.Patient( - data={ - "name": [{"given": ["John"], "family": "Smith"}], - "sex": "male", - "address": [{"zipcode": "12345"}], - }, - ) - pat3 = models.Patient(data={"name": [{"family": "Smith"}, {"family": "Harrison"}]}) - - assert not matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) - assert matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert not matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) - assert not matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) - - assert matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.GIVEN_NAME)) - assert matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert not matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert not matchers.compare_match_all(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.SEX)) - assert not matchers.compare_match_all(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.ZIP)) - - assert not matchers.compare_match_all(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert matchers.compare_match_all(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - assert not matchers.compare_match_all(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.BIRTHDATE)) - - with pytest.raises(ValueError): - matchers.compare_match_all(record, pat1, "unknown") - - -def test_compare_fuzzy_match(): - record = schemas.PIIRecord( - name=[{"given": ["John"], 
"family": "Smith"}, {"family": "Harrison"}], - birthDate="1980-01-01", - ) - pat1 = models.Patient( - data={"name": [{"given": ["Jon", "Mike"], "family": "Doe"}], "birthDate": "Jan 1 1980"} - ) - pat2 = models.Patient(data={"name": [{"given": ["Michael"], "family": "Smtih"}], "sex": "male"}) - pat3 = models.Patient(data={"name": [{"family": "Smyth"}, {"family": "Williams"}]}) - - assert matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert not matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - - assert not matchers.compare_fuzzy_match(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert matchers.compare_fuzzy_match(record, pat2, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - - assert not matchers.compare_fuzzy_match(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.FIRST_NAME)) - assert matchers.compare_fuzzy_match(record, pat3, schemas.Feature(attribute=schemas.FeatureAttribute.LAST_NAME)) - - with pytest.raises(ValueError): - matchers.compare_fuzzy_match(record, pat1, schemas.Feature(attribute="first_name")) - - def test_compare_probabilistic_exact_match(): with pytest.raises(ValueError): matchers.compare_probabilistic_exact_match( diff --git a/tests/unit/models/test_algorithm.py b/tests/unit/models/test_algorithm.py index 4ab9acad..eee88476 100644 --- a/tests/unit/models/test_algorithm.py +++ b/tests/unit/models/test_algorithm.py @@ -90,14 +90,14 @@ def test_from_dict_with_passes(self): "evaluators": [ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_match_all", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", }, { "feature": "LAST_NAME", - "func": "func:recordlinker.linking.matchers.compare_match_all", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", }, ], - "rule": 
"func:recordlinker.linking.matchers.rule_match", + "rule": "func:recordlinker.linking.matchers.rule_probabilistic_match", } ], } @@ -110,14 +110,14 @@ def test_from_dict_with_passes(self): assert algo.passes[0].evaluators == [ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_match_all", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", }, { "feature": "LAST_NAME", - "func": "func:recordlinker.linking.matchers.compare_match_all", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", }, ] - assert algo.passes[0].rule == "func:recordlinker.linking.matchers.rule_match" + assert algo.passes[0].rule == "func:recordlinker.linking.matchers.rule_probabilistic_match" class TestAlgorithmPass: @@ -129,21 +129,21 @@ def test_bound_evaluators(self): evaluators=[ { "feature": "BIRTHDATE", - "func": "func:recordlinker.linking.matchers.compare_match_any", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ] ) assert ap.bound_evaluators() == [ - models.BoundEvaluator("BIRTHDATE", matchers.compare_match_any) + models.BoundEvaluator("BIRTHDATE", matchers.compare_probabilistic_fuzzy_match) ] ap.evaluators = [ { "feature": "BIRTHDATE", - "func": "func:recordlinker.linking.matchers.compare_match_all", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ] assert ap.bound_evaluators() == [ - models.BoundEvaluator("BIRTHDATE", matchers.compare_match_all) + models.BoundEvaluator("BIRTHDATE", matchers.compare_probabilistic_fuzzy_match) ] ap.evaluators = [ {"feature": "BIRTHDATE", "func": "func:recordlinker.linking.matchers.invalid"} @@ -155,9 +155,7 @@ def test_bound_rule(self): """ Tests that the bound_rule method returns the correct function """ - ap = models.AlgorithmPass(rule="func:recordlinker.linking.matchers.rule_match") - assert ap.bound_rule() == matchers.rule_match - ap.rule = 
"func:recordlinker.linking.matchers.rule_probabilistic_match" + ap = models.AlgorithmPass(rule="func:recordlinker.linking.matchers.rule_probabilistic_match") assert ap.bound_rule() == matchers.rule_probabilistic_match ap.rule = "func:recordlinker.linking.matchers.invalid" with pytest.raises(ValueError, match="Failed to convert string to callable"): diff --git a/tests/unit/routes/test_algorithm_router.py b/tests/unit/routes/test_algorithm_router.py index 6533e665..427a455c 100644 --- a/tests/unit/routes/test_algorithm_router.py +++ b/tests/unit/routes/test_algorithm_router.py @@ -10,31 +10,21 @@ class TestListAlgorithms: def test_list(self, client): - algo1 = models.Algorithm(label="basic", is_default=True, description="First algorithm") - algo2 = models.Algorithm(label="enhanced", description="Second algorithm") + algo1 = models.Algorithm(label="default", is_default=True, description="First algorithm") client.session.add(algo1) - client.session.add(algo2) client.session.commit() response = client.get("/algorithm") assert response.status_code == 200 assert response.json() == [ { - "label": "basic", + "label": "default", "is_default": True, "description": "First algorithm", "include_multiple_matches": True, "belongingness_ratio": [1.0, 1.0], "pass_count": 0, }, - { - "label": "enhanced", - "is_default": False, - "description": "Second algorithm", - "include_multiple_matches": True, - "belongingness_ratio": [1.0, 1.0], - "pass_count": 0, - }, ] @@ -45,7 +35,8 @@ def test_404(self, client): def test_get(self, client): algo = models.Algorithm( - label="basic", + label="default", + is_default=True, description="First algorithm", belongingness_ratio=(0.25, 0.5), passes=[ @@ -56,11 +47,11 @@ def test_get(self, client): evaluators=[ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", }, ], - rule="func:recordlinker.linking.matchers.rule_match", - 
kwargs={"similarity_measure": "JaroWinkler"}, + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", + kwargs={"similarity_measure": "JaroWinkler", "log_odds": {"FIRST_NAME": 6.8}}, ) ], ) @@ -70,8 +61,8 @@ def test_get(self, client): response = client.get(f"/algorithm/{algo.label}") assert response.status_code == 200 assert response.json() == { - "label": "basic", - "is_default": False, + "label": "default", + "is_default": True, "description": "First algorithm", "include_multiple_matches": True, "belongingness_ratio": [0.25, 0.5], @@ -81,12 +72,13 @@ def test_get(self, client): "evaluators": [ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ], - "rule": "func:recordlinker.linking.matchers.rule_match", + "rule": "func:recordlinker.linking.matchers.rule_probabilistic_match", "kwargs": { "similarity_measure": "JaroWinkler", + "log_odds": {"FIRST_NAME": 6.8} }, } ], @@ -99,7 +91,7 @@ def test_invalid_data(self, client): assert response.status_code == 422 def test_exsiting_default(self, client): - algo = models.Algorithm(label="basic", is_default=True, description="First algorithm") + algo = models.Algorithm(label="default", is_default=True, description="First algorithm") client.session.add(algo) client.session.commit() @@ -114,8 +106,8 @@ def test_exsiting_default(self, client): def test_create(self, client): payload = { - "label": "basic", - "description": "First algorithm", + "label": "created", + "description": "Created algorithm", "belongingness_ratio": (0.25, 0.5), "passes": [ { @@ -125,10 +117,10 @@ def test_create(self, client): "evaluators": [ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ], - "rule": "func:recordlinker.linking.matchers.rule_match", + "rule": 
"func:recordlinker.linking.matchers.rule_probabilistic_match", } ], } @@ -136,28 +128,28 @@ def test_create(self, client): assert response.status_code == 201 algo = ( - client.session.query(models.Algorithm).filter(models.Algorithm.label == "basic").first() + client.session.query(models.Algorithm).filter(models.Algorithm.label == "created").first() ) - assert algo.label == "basic" + assert algo.label == "created" assert algo.is_default is False - assert algo.description == "First algorithm" + assert algo.description == "Created algorithm" assert algo.belongingness_ratio == (0.25, 0.5) assert len(algo.passes) == 1 assert algo.passes[0].blocking_keys == ["BIRTHDATE"] assert algo.passes[0].evaluators == [ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ] - assert algo.passes[0].rule == "func:recordlinker.linking.matchers.rule_match" + assert algo.passes[0].rule == "func:recordlinker.linking.matchers.rule_probabilistic_match" assert algo.passes[0].kwargs == {} class TestUpdateAlgorithm: def test_404(self, client): payload = { - "label": "basic", + "label": "bad", "description": "First algorithm", "belongingness_ratio": (1.0, 1.0), "passes": [], @@ -166,22 +158,22 @@ def test_404(self, client): assert response.status_code == 404 def test_invalid_data(self, client): - algo = models.Algorithm(label="basic", description="First algorithm") + algo = models.Algorithm(label="default", description="First algorithm") client.session.add(algo) client.session.commit() - response = client.put("/algorithm/basic", json={}) + response = client.put("/algorithm/default", json={}) assert response.status_code == 422 def test_exsiting_default(self, client): - algo1 = models.Algorithm(label="default", is_default=True, description="algorithm") - algo2 = models.Algorithm(label="basic", is_default=False, description="First algorithm") + algo1 = 
models.Algorithm(label="default", is_default=True, description="Default algorithm") + algo2 = models.Algorithm(label="bonus", is_default=False, description="Extra algorithm") client.session.add(algo1) client.session.add(algo2) client.session.commit() payload = { - "label": "basic", + "label": "bonus", "is_default": True, "description": "new default algorithm", "passes": [], @@ -190,12 +182,13 @@ def test_exsiting_default(self, client): assert response.status_code == 422 def test_update(self, client): - algo = models.Algorithm(label="basic", description="First algorithm", passes=[]) + algo = models.Algorithm(label="default", description="First algorithm", passes=[]) client.session.add(algo) client.session.commit() payload = { - "label": "basic", + "label": "default", + "is_default": True, "description": "Updated algorithm", "belongingness_ratio": (0.25, 0.5), "passes": [ @@ -206,21 +199,21 @@ def test_update(self, client): "evaluators": [ { "feature": "FIRST_NAME", - "func": "func:recordlinker.linking.matchers.compare_fuzzy_match", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ], - "rule": "func:recordlinker.linking.matchers.rule_match", + "rule": "func:recordlinker.linking.matchers.rule_probabilistic_match", } ], } - response = client.put("/algorithm/basic", json=payload) + response = client.put("/algorithm/default", json=payload) assert response.status_code == 200 algo = ( - client.session.query(models.Algorithm).filter(models.Algorithm.label == "basic").first() + client.session.query(models.Algorithm).filter(models.Algorithm.label == "default").first() ) - assert algo.label == "basic" - assert algo.is_default is False + assert algo.label == "default" + assert algo.is_default is True assert algo.description == "Updated algorithm" assert algo.belongingness_ratio == (0.25, 0.5) assert len(algo.passes) == 1 @@ -228,10 +221,10 @@ def test_update(self, client): assert algo.passes[0].evaluators == [ { "feature": "FIRST_NAME", - 
"func": "func:recordlinker.linking.matchers.compare_fuzzy_match", + "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", } ] - assert algo.passes[0].rule == "func:recordlinker.linking.matchers.rule_match" + assert algo.passes[0].rule == "func:recordlinker.linking.matchers.rule_probabilistic_match" assert algo.passes[0].kwargs == {} @@ -241,14 +234,14 @@ def test_404(self, client): assert response.status_code == 404 def test_delete(self, client): - algo = models.Algorithm(label="basic", description="First algorithm") + algo = models.Algorithm(label="default", description="First algorithm") client.session.add(algo) client.session.commit() - response = client.delete("/algorithm/basic") + response = client.delete("/algorithm/default") assert response.status_code == 204 algo = ( - client.session.query(models.Algorithm).filter(models.Algorithm.label == "basic").first() + client.session.query(models.Algorithm).filter(models.Algorithm.label == "default").first() ) assert algo is None diff --git a/tests/unit/routes/test_link_router.py b/tests/unit/routes/test_link_router.py index cf6a7a82..0728b398 100644 --- a/tests/unit/routes/test_link_router.py +++ b/tests/unit/routes/test_link_router.py @@ -22,16 +22,14 @@ def test_algorithm_or_422(session): - algo1 = models.Algorithm(label="basic", is_default=True, description="First algorithm") - algo2 = models.Algorithm(label="enhanced", description="Second algorithm") + algo1 = models.Algorithm(label="default", is_default=True, description="First algorithm") session.add(algo1) - session.add(algo2) session.commit() - alg = link_router.algorithm_or_422(session, "enhanced") - assert alg.label == "enhanced" + alg = link_router.algorithm_or_422(session, "default") + assert alg.label == "default" alg = link_router.algorithm_or_422(session, None) - assert alg.label == "basic" + assert alg.label == "default" with pytest.raises(HTTPException) as exc: link_router.algorithm_or_422(session, "invalid") assert 
exc.value.status_code == 422 @@ -67,8 +65,8 @@ def patients(self): return patients @mock.patch("recordlinker.database.algorithm_service.default_algorithm") - def test_link_success(self, patched_subprocess, basic_algorithm, patients, client): - patched_subprocess.return_value = basic_algorithm + def test_link_success(self, patched_subprocess, default_algorithm, patients, client): + patched_subprocess.return_value = default_algorithm response_1 = client.post( "/link", json={"record": json.loads(patients[0].model_dump_json(exclude_none=True))} ) @@ -136,103 +134,6 @@ def test_link_success(self, patched_subprocess, basic_algorithm, patients, clien assert response_6.json()["prediction"] == "no_match" assert not response_6.json()["results"] - @mock.patch("recordlinker.database.algorithm_service.get_algorithm") - def test_link_enhanced_algorithm( - self, patched_subprocess, enhanced_algorithm, patients, client - ): - patched_subprocess.return_value = enhanced_algorithm - - response_1 = client.post( - "/link", - json={ - "record": json.loads(patients[0].model_dump_json(exclude_none=True)), - "algorithm": "dibbs-enhanced", - }, - ) - person_1 = response_1.json()["person_reference_id"] - assert response_1.json()["patient_reference_id"] and uuid.UUID( - response_1.json()["patient_reference_id"] - ) - assert person_1 - assert response_1.json()["prediction"] == "no_match" - assert not response_1.json()["results"] - - response_2 = client.post( - "/link", - json={ - "record": json.loads(patients[1].model_dump_json(exclude_none=True)), - "algorithm": "dibbs-enhanced", - }, - ) - person_2 = response_2.json()["person_reference_id"] - assert response_2.json()["patient_reference_id"] and uuid.UUID( - response_2.json()["patient_reference_id"] - ) - assert person_2 == person_1 - assert response_2.json()["prediction"] == "match" - assert len(response_2.json()["results"]) == 1 - - response_3 = client.post( - "/link", - json={ - "record": 
json.loads(patients[2].model_dump_json(exclude_none=True)), - "algorithm": "dibbs-enhanced", - }, - ) - person_3 = response_3.json()["person_reference_id"] - assert response_3.json()["patient_reference_id"] and uuid.UUID( - response_3.json()["patient_reference_id"] - ) - assert person_3 - assert response_3.json()["prediction"] == "no_match" - assert not response_3.json()["results"] - - # Cluster membership success--justified match - response_4 = client.post( - "/link", - json={ - "record": json.loads(patients[3].model_dump_json(exclude_none=True)), - "algorithm": "dibbs-enhanced", - }, - ) - person_4 = response_4.json()["person_reference_id"] - assert response_4.json()["patient_reference_id"] and uuid.UUID( - response_4.json()["patient_reference_id"] - ) - assert person_4 == person_1 - assert response_4.json()["prediction"] == "match" - assert len(response_4.json()["results"]) == 1 - - response_5 = client.post( - "/link", - json={ - "record": json.loads(patients[4].model_dump_json(exclude_none=True)), - "algorithm": "dibbs-enhanced", - }, - ) - person_5 = response_5.json()["person_reference_id"] - assert response_5.json()["patient_reference_id"] and uuid.UUID( - response_5.json()["patient_reference_id"] - ) - assert person_5 - assert response_5.json()["prediction"] == "no_match" - assert not response_5.json()["results"] - - response_6 = client.post( - "/link", - json={ - "record": json.loads(patients[5].model_dump_json(exclude_none=True)), - "algorithm": "dibbs-enhanced", - }, - ) - person_6 = response_6.json()["person_reference_id"] - assert response_6.json()["patient_reference_id"] and uuid.UUID( - response_6.json()["patient_reference_id"] - ) - assert person_6 - assert response_6.json()["prediction"] == "no_match" - assert not response_6.json()["results"] - @mock.patch("recordlinker.database.algorithm_service.get_algorithm") def test_link_invalid_algorithm_param(self, patched_subprocess, patients, client): patched_subprocess.return_value = None @@ -264,8 +165,8 
@@ def test_link_no_default_algorithm(self, patched_subprocess, patients, client): class TestLinkFHIR: @mock.patch("recordlinker.database.algorithm_service.default_algorithm") - def test_bundle_with_no_patient(self, patched_subprocess, basic_algorithm, client): - patched_subprocess.return_value = basic_algorithm + def test_bundle_with_no_patient(self, patched_subprocess, default_algorithm, client): + patched_subprocess.return_value = default_algorithm bad_bundle = {"entry": []} expected_response = { "detail": "Supplied bundle contains no Patient resource", @@ -278,8 +179,8 @@ def test_bundle_with_no_patient(self, patched_subprocess, basic_algorithm, clien assert actual_response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY @mock.patch("recordlinker.database.algorithm_service.default_algorithm") - def test_invalid_bundle(self, patched_subprocess, basic_algorithm, client): - patched_subprocess.return_value = basic_algorithm + def test_invalid_bundle(self, patched_subprocess, default_algorithm, client): + patched_subprocess.return_value = default_algorithm bad_bundle = {"entry": [{"resource": {"resourceType": "Patient", "name": "John Doe"}}]} expected_response = { "detail": "Invalid Patient resource", @@ -292,8 +193,8 @@ def test_invalid_bundle(self, patched_subprocess, basic_algorithm, client): assert actual_response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY @mock.patch("recordlinker.database.algorithm_service.default_algorithm") - def test_success(self, patched_subprocess, basic_algorithm, client): - patched_subprocess.return_value = basic_algorithm + def test_success(self, patched_subprocess, default_algorithm, client): + patched_subprocess.return_value = default_algorithm test_bundle = load_test_json_asset("patient_bundle_to_link_with_mpi.json") entry_list = copy.deepcopy(test_bundle["entry"]) @@ -396,110 +297,6 @@ def test_success(self, patched_subprocess, basic_algorithm, client): assert resp_6.json()["prediction"] == "no_match" assert not 
resp_6.json()["results"] - @mock.patch("recordlinker.database.algorithm_service.get_algorithm") - def test_enhanced_algo(self, patched_subprocess, enhanced_algorithm, client): - patched_subprocess.return_value = enhanced_algorithm - test_bundle = load_test_json_asset("patient_bundle_to_link_with_mpi.json") - entry_list = copy.deepcopy(test_bundle["entry"]) - - bundle_1 = test_bundle - bundle_1["entry"] = [entry_list[0]] - resp_1 = client.post("/link/fhir", json={"bundle": bundle_1, "algorithm": "dibbs-enhanced"}) - new_bundle = resp_1.json()["updated_bundle"] - person_1 = [ - r.get("resource") - for r in new_bundle["entry"] - if r.get("resource").get("resourceType") == "Person" - ][0] - assert resp_1.json()["patient_reference_id"] and uuid.UUID( - resp_1.json()["patient_reference_id"] - ) - assert resp_1.json()["person_reference_id"] == person_1.get("id") - assert resp_1.json()["prediction"] == "no_match" - assert not resp_1.json()["results"] - - bundle_2 = test_bundle - bundle_2["entry"] = [entry_list[1]] - resp_2 = client.post("/link/fhir", json={"bundle": bundle_2, "algorithm": "dibbs-enhanced"}) - new_bundle = resp_2.json()["updated_bundle"] - person_2 = [ - r.get("resource") - for r in new_bundle["entry"] - if r.get("resource").get("resourceType") == "Person" - ][0] - assert resp_2.json()["patient_reference_id"] and uuid.UUID( - resp_2.json()["patient_reference_id"] - ) - assert resp_2.json()["person_reference_id"] == person_2.get("id") - assert person_2.get("id") == person_1.get("id") - assert resp_2.json()["prediction"] == "match" - assert len(resp_2.json()["results"]) == 1 - - bundle_3 = test_bundle - bundle_3["entry"] = [entry_list[2]] - resp_3 = client.post("/link/fhir", json={"bundle": bundle_3, "algorithm": "dibbs-enhanced"}) - new_bundle = resp_3.json()["updated_bundle"] - person_3 = [ - r.get("resource") - for r in new_bundle["entry"] - if r.get("resource").get("resourceType") == "Person" - ][0] - assert resp_3.json()["patient_reference_id"] and 
uuid.UUID( - resp_3.json()["patient_reference_id"] - ) - assert resp_3.json()["person_reference_id"] == person_3.get("id") - assert resp_3.json()["prediction"] == "no_match" - assert not resp_3.json()["results"] - - bundle_4 = test_bundle - bundle_4["entry"] = [entry_list[3]] - resp_4 = client.post("/link/fhir", json={"bundle": bundle_4, "algorithm": "dibbs-enhanced"}) - new_bundle = resp_4.json()["updated_bundle"] - person_4 = [ - r.get("resource") - for r in new_bundle["entry"] - if r.get("resource").get("resourceType") == "Person" - ][0] - assert resp_4.json()["patient_reference_id"] and uuid.UUID( - resp_4.json()["patient_reference_id"] - ) - assert resp_4.json()["person_reference_id"] == person_1.get("id") - assert person_4.get("id") == person_1.get("id") - assert resp_4.json()["prediction"] == "match" - assert len(resp_4.json()["results"]) == 1 - - bundle_5 = test_bundle - bundle_5["entry"] = [entry_list[4]] - resp_5 = client.post("/link/fhir", json={"bundle": bundle_5, "algorithm": "dibbs-enhanced"}) - new_bundle = resp_5.json()["updated_bundle"] - person_5 = [ - r.get("resource") - for r in new_bundle["entry"] - if r.get("resource").get("resourceType") == "Person" - ][0] - assert resp_5.json()["patient_reference_id"] and uuid.UUID( - resp_5.json()["patient_reference_id"] - ) - assert resp_5.json()["person_reference_id"] == person_5.get("id") - assert resp_5.json()["prediction"] == "no_match" - assert not resp_5.json()["results"] - - bundle_6 = test_bundle - bundle_6["entry"] = [entry_list[5]] - resp_6 = client.post("/link/fhir", json={"bundle": bundle_6, "algorithm": "dibbs-enhanced"}) - new_bundle = resp_6.json()["updated_bundle"] - person_6 = [ - r.get("resource") - for r in new_bundle["entry"] - if r.get("resource").get("resourceType") == "Person" - ][0] - assert resp_6.json()["patient_reference_id"] and uuid.UUID( - resp_6.json()["patient_reference_id"] - ) - assert resp_6.json()["person_reference_id"] == person_6.get("id") - assert 
resp_6.json()["prediction"] == "no_match" - assert not resp_6.json()["results"] - @mock.patch("recordlinker.database.algorithm_service.get_algorithm") def test_invalid_algorithm_param(self, patched_subprocess, client): patched_subprocess.return_value = None @@ -531,8 +328,8 @@ def test_invalid_algorithm(self, client): assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY assert resp.json()["detail"] == "No algorithm found" - def test_no_match(self, client, basic_algorithm, patients): - client.session.add(basic_algorithm) + def test_no_match(self, client, default_algorithm, patients): + client.session.add(default_algorithm) client.session.commit() resp = client.post("/match", json={"record": patients[0].to_dict(True)}) assert resp.status_code == status.HTTP_200_OK @@ -543,8 +340,8 @@ def test_no_match(self, client, basic_algorithm, patients): assert payload.get("patient_reference_id") is None assert len(client.session.query(models.Patient).all()) == 0 - def test_match(self, client, basic_algorithm, patients): - client.session.add(basic_algorithm) + def test_match(self, client, default_algorithm, patients): + client.session.add(default_algorithm) client.session.commit() per1 = client.post("/link", json={"record": patients[0].to_dict(True)}).json()["person_reference_id"] @@ -573,8 +370,8 @@ def test_invalid_algorithm(self, client): assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY assert resp.json()["detail"] == "No algorithm found" - def test_no_match(self, client, basic_algorithm, patient_bundles): - client.session.add(basic_algorithm) + def test_no_match(self, client, default_algorithm, patient_bundles): + client.session.add(default_algorithm) client.session.commit() resp = client.post("/match/fhir", json={"bundle": patient_bundles[0]}) assert resp.status_code == status.HTTP_200_OK @@ -586,8 +383,8 @@ def test_no_match(self, client, basic_algorithm, patient_bundles): assert payload["updated_bundle"] is None assert 
len(client.session.query(models.Patient).all()) == 0 - def test_match(self, client, basic_algorithm, patient_bundles): - client.session.add(basic_algorithm) + def test_match(self, client, default_algorithm, patient_bundles): + client.session.add(default_algorithm) client.session.commit() per1 = client.post("/link/fhir", json={"bundle": patient_bundles[0]}).json()["person_reference_id"] diff --git a/tests/unit/routes/test_seed_router.py b/tests/unit/routes/test_seed_router.py index 10e0fa31..ea73481a 100644 --- a/tests/unit/routes/test_seed_router.py +++ b/tests/unit/routes/test_seed_router.py @@ -43,8 +43,8 @@ def test_large_batch(self, client): assert client.session.query(models.BlockingValue).count() == 12139 @mock.patch("recordlinker.database.algorithm_service.default_algorithm") - def test_seed_and_link(self, mock_algorithm, basic_algorithm, client): - mock_algorithm.return_value = basic_algorithm + def test_seed_and_link(self, mock_algorithm, default_algorithm, client): + mock_algorithm.return_value = default_algorithm record = { "birth_date": "1956-09-06", "sex": "F", diff --git a/tests/unit/schemas/test_algorithm.py b/tests/unit/schemas/test_algorithm.py index 721d0494..042d8bfa 100644 --- a/tests/unit/schemas/test_algorithm.py +++ b/tests/unit/schemas/test_algorithm.py @@ -19,25 +19,25 @@ def test_validate_blocking_keys(self): AlgorithmPass( blocking_keys=keys, evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) keys = ["LAST_NAME", "BIRTHDATE", "ZIP"] # write an assertion that no exception is raised AlgorithmPass( blocking_keys=keys, evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) def test_validate_evaluators(self): evaluators = [ - {"feature": "name", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "name", "func": 
"func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"} ] with pytest.raises(pydantic.ValidationError): AlgorithmPass( blocking_keys=[], evaluators=evaluators, - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) evaluators = [ {"feature": "LAST_NAME", "func": "func:recordlinker.linking.matchers.unknown"} @@ -46,38 +46,38 @@ def test_validate_evaluators(self): AlgorithmPass( blocking_keys=[], evaluators=evaluators, - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) evaluators = [ { "feature": "LAST_NAME", - "func": "func:recordlinker.linking.matchers.rule_match", + "func": "func:recordlinker.linking.matchers.rule_probabilistic_match", } ] with pytest.raises(pydantic.ValidationError): AlgorithmPass( blocking_keys=[], evaluators=evaluators, - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) evaluators = [ - {"feature": "LAST_NAME", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "LAST_NAME", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"} ] # write an assertion that no exception is raised AlgorithmPass( blocking_keys=[], evaluators=evaluators, - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) evaluators = [ - {"feature": "FIRST_NAME:DL", "func": "func:recordlinker.linking.matchers.compare_match_any"} + {"feature": "FIRST_NAME:DL", "func": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"} ] with pytest.raises(pydantic.ValidationError): AlgorithmPass( blocking_keys=[], evaluators=evaluators, - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) def test_validate_rule(self): @@ -88,14 +88,7 @@ def 
test_validate_rule(self): evaluators=[], rule=rule, ) - rule = "func:recordlinker.linking.matchers.compare_match_any" - with pytest.raises(pydantic.ValidationError): - AlgorithmPass( - blocking_keys=[], - evaluators=[], - rule=rule, - ) - rule = "func:recordlinker.linking.matchers.rule_match" + rule = "func:recordlinker.linking.matchers.rule_probabilistic_match" AlgorithmPass( blocking_keys=[], evaluators=[], @@ -107,14 +100,14 @@ def test_kwargs(self): AlgorithmPass( blocking_keys=[], evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", cluster_ratio=0.5, kwargs={"invalid": "key"}, ) AlgorithmPass( blocking_keys=[], evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", kwargs={ "similarity_measure": "JaroWinkler", "thresholds": {"CITY": 0.95, "ADDRESS": 0.98}, @@ -136,7 +129,7 @@ def test_validate_belongingness_ratio(self): AlgorithmPass( blocking_keys=[], evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) ], ) @@ -148,7 +141,7 @@ def test_validate_belongingness_ratio(self): AlgorithmPass( blocking_keys=[], evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) ], ) @@ -160,7 +153,7 @@ def test_validate_belongingness_ratio(self): AlgorithmPass( blocking_keys=[], evaluators=[], - rule="func:recordlinker.linking.matchers.rule_match", + rule="func:recordlinker.linking.matchers.rule_probabilistic_match", ) ], ) diff --git a/tests/unit/utils/test_functools.py b/tests/unit/utils/test_functools.py index a3ddf4d2..ed4c3166 100644 --- a/tests/unit/utils/test_functools.py +++ b/tests/unit/utils/test_functools.py @@ -6,46 +6,46 @@ def test_bind_functions(): funcs = { - "first_name": 
"func:recordlinker.linking.matchers.compare_fuzzy_match", - "last_name": "func:recordlinker.linking.matchers.compare_match_all", + "first_name": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", + "last_name": "func:recordlinker.linking.matchers.compare_probabilistic_exact_match", } assert utils.bind_functions(funcs) == { - "first_name": matchers.compare_fuzzy_match, - "last_name": matchers.compare_match_all, + "first_name": matchers.compare_probabilistic_fuzzy_match, + "last_name": matchers.compare_probabilistic_exact_match, } funcs = { "blocks": [ {"value": "birthdate"}, - {"value": "func:recordlinker.linking.matchers.compare_match_all"}, + {"value": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match"}, ] } assert utils.bind_functions(funcs) == { "blocks": [ {"value": "birthdate"}, - {"value": matchers.compare_match_all}, + {"value": matchers.compare_probabilistic_fuzzy_match}, ] } funcs = { "nested": { - "first_name": "func:recordlinker.linking.matchers.compare_fuzzy_match", - "last_name": "func:recordlinker.linking.matchers.compare_match_all", + "first_name": "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match", + "last_name": "func:recordlinker.linking.matchers.compare_probabilistic_exact_match", } } assert utils.bind_functions(funcs) == { "nested": { - "first_name": matchers.compare_fuzzy_match, - "last_name": matchers.compare_match_all, + "first_name": matchers.compare_probabilistic_fuzzy_match, + "last_name": matchers.compare_probabilistic_exact_match, } } def test_str_to_callable(): - val = "func:recordlinker.linking.matchers.compare_match_all" - assert utils.str_to_callable(val) == matchers.compare_match_all - val = "recordlinker.linking.matchers.compare_match_all" - assert utils.str_to_callable(val) == matchers.compare_match_all + val = "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" + assert utils.str_to_callable(val) == matchers.compare_probabilistic_fuzzy_match + val = 
"recordlinker.linking.matchers.compare_probabilistic_exact_match" + assert utils.str_to_callable(val) == matchers.compare_probabilistic_exact_match val = "recordlinker.unknown_module.unknown_function" with pytest.raises(ValueError): utils.str_to_callable(val) @@ -56,10 +56,10 @@ def test_str_to_callable(): def test_func_to_str(): assert ( - utils.func_to_str(matchers.compare_match_all) - == "func:recordlinker.linking.matchers.compare_match_all" + utils.func_to_str(matchers.compare_probabilistic_fuzzy_match) + == "func:recordlinker.linking.matchers.compare_probabilistic_fuzzy_match" ) assert ( - utils.func_to_str(matchers.compare_fuzzy_match) - == "func:recordlinker.linking.matchers.compare_fuzzy_match" + utils.func_to_str(matchers.compare_probabilistic_exact_match) + == "func:recordlinker.linking.matchers.compare_probabilistic_exact_match" ) From c7aa14d09b0f63a37c6006524bb8a13df496eb79 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 13 Feb 2025 13:05:53 -0800 Subject: [PATCH 2/2] add custom api docs template to address a11y issues (#216) ## Description Developing a custom Handlebars template for Redoc to generate API documentation. This template enables us to override CSS rules and execute JavaScript functions to modify HTML tag attributes, aiming to resolve known accessibility (a11y) issues. ## Related Issues closes #198 ## Additional Notes Using [axe devtools](https://www.deque.com/axe/devtools/) to identify and help correct known a11y issues. These changes have resolved all known serious, critical, moderate and minor issues according to axe. 
--- docs/api_template.hbs | 56 +++++++++++++++++++++++++++++++++++++++++++ scripts/build_docs.sh | 2 +- 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 docs/api_template.hbs diff --git a/docs/api_template.hbs b/docs/api_template.hbs new file mode 100644 index 00000000..650654d2 --- /dev/null +++ b/docs/api_template.hbs @@ -0,0 +1,56 @@ +{{!-- This is the default template for the ReDoc API documentation. --}} +{{!-- You can customize it by following the instructions at https://redocly.com/docs/cli/commands/build-docs --}} +{{!-- The template uses Handlebars.js syntax: https://handlebarsjs.com/ --}} +{{!-- Styling and attribute changes have been made to account for a11y issues --}} + + + + + + {{title}} + + + + {{{redocHead}}} + {{#unless disableGoogleFont}}{{/unless}} + + + + {{{redocHTML}}} + + + + diff --git a/scripts/build_docs.sh b/scripts/build_docs.sh index 0cf62775..609e7765 100755 --- a/scripts/build_docs.sh +++ b/scripts/build_docs.sh @@ -15,4 +15,4 @@ SITE_NAME="RecordLinker Documentation (${VERSION})" SITE_NAME=${SITE_NAME} mkdocs build --config-file docs/mkdocs.yml -d "../${OUT}" python -m recordlinker.utils.openapi_schema > ${OUT}/openapi.json -npx @redocly/cli build-docs -o "${OUT}/api-docs.html" "${OUT}/openapi.json" +npx @redocly/cli build-docs -t docs/api_template.hbs -o "${OUT}/api-docs.html" "${OUT}/openapi.json"