From 64996cb2893376529be43a637cc8d54914d96321 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Tue, 17 Sep 2024 21:28:33 -0700 Subject: [PATCH 01/19] converting BlockingKey to an Enum --- ...8f1d41fad_convert_blockingkey_into_enum.py | 57 ++++++++++ src/recordlinker/linkage/models.py | 102 +++++++++++++++--- 2 files changed, 142 insertions(+), 17 deletions(-) create mode 100644 alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py diff --git a/alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py b/alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py new file mode 100644 index 00000000..e57acd1a --- /dev/null +++ b/alembic/versions/ad18f1d41fad_convert_blockingkey_into_enum.py @@ -0,0 +1,57 @@ +"""convert BlockingKey into enum + +Revision ID: ad18f1d41fad +Revises: 6052c193a26a +Create Date: 2024-09-17 21:15:37.714595 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'ad18f1d41fad' +down_revision: Union[str, None] = '6052c193a26a' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('mpi_blocking_key') + op.drop_table('mpi_blocking_value') + + op.create_table('mpi_blocking_value', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('patient_id', sa.Integer(), nullable=False), + sa.Column('blockingkey', sa.Integer(), nullable=False), + sa.Column('value', sa.String(length=50), nullable=False), + sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_index('idx_blocking_value_patient_key_value', 'mpi_blocking_value', ['patient_id', 'blockingkey', 'value'], unique=False) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('mpi_blocking_value') + + op.create_table('mpi_blocking_key', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('key', sa.String(length=50), nullable=False), + sa.PrimaryKeyConstraint('id') + ) + op.create_table('mpi_blocking_value', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('patient_id', sa.Integer(), nullable=False), + sa.Column('blockingkey_id', sa.Integer(), nullable=False), + sa.Column('value', sa.String(length=50), nullable=False), + sa.ForeignKeyConstraint(['blockingkey_id'], ['mpi_blocking_key.id'], ), + sa.ForeignKeyConstraint(['patient_id'], ['mpi_patient.id'], ), + sa.PrimaryKeyConstraint('id') + ) + op.create_index(op.f('ix_mpi_blocking_value_value'), 'mpi_blocking_value', ['value'], unique=False) + # ### end Alembic commands ### diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 1d4c30a5..98c42b51 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -1,9 +1,24 @@ +import enum +import typing import uuid -from sqlalchemy import ForeignKey -from sqlalchemy import JSON +import dotenv +from sqlalchemy import create_engine from sqlalchemy import orm -from sqlalchemy import String +from sqlalchemy import schema +from sqlalchemy import types as sqltypes + + +def get_session() -> orm.Session: + """ + Creates a new session to the MPI database and returns it. + """ + db_uri = dotenv.dotenv_values()["DB_URI"] + if not db_uri: + raise ValueError("DB_URI environment variable not set") + engine = create_engine(db_uri) + Base.metadata.create_all(engine) + return orm.Session(engine) class Base(orm.DeclarativeBase): @@ -21,29 +36,82 @@ class ExternalPerson(Base): __tablename__ = "mpi_external_person" id: orm.Mapped[int] = orm.mapped_column(primary_key=True) - person_id: orm.Mapped[int] = orm.mapped_column(ForeignKey("mpi_person.id")) - external_id: orm.Mapped[str] = orm.mapped_column(String(255)) - source: orm.Mapped[str] = orm.mapped_column(String(255)) + person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_person.id")) + external_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255)) + source: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255)) class Patient(Base): __tablename__ = "mpi_patient" id: orm.Mapped[int] = orm.mapped_column(primary_key=True) - person_id: orm.Mapped[int] = orm.mapped_column(ForeignKey("mpi_person.id")) - data: orm.Mapped[dict] = orm.mapped_column(JSON) - + person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_person.id")) + data: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON) + + +class BlockingKey(enum.Enum): + """ + Enum for the different types of blocking keys that can be used for patient + matching. This is the universe of all possible blocking keys that a user can + choose from when configuring their algorithm. When data is loaded into the + MPI, all possible BlockingValues will be created for the defined BlockingKeys. + However, only a subset will be used in matching, based on the configuration of + the algorithm. By defining them all upfront, we give the user flexibility in + adjusting their algorithm configuration without having to reload the data. + + **HERE BE DRAGONS**: IN A REAL SYSTEM, THESE ENUMS SHOULD NOT BE CHANGED!!! + """ + BIRTHDATE = 1, "Date of Birth" + MRN = 2, "Last 4 chars of MRN" + SEX = 3, "Gender" + ZIP = 4, "Zip Code" + FIRST_NAME = 5, "First 4 chars of First Name" + LAST_NAME = 6, "First 4 chars of Last Name" + + def __init__(self, id: int, description: str): + self.id = id + self.description = description + + def to_value(self, data: dict) -> typing.Generator[str, None, None]: + """ + Given a data dictionary of Patient PII data, return a generator of all + possible values for this Key. Many Keys will only have 1 possible value, + but some (like first name) could have multiple values. + """ + if self == BlockingKey.BIRTHDATE: + if "birthdate" in data: + yield data["birthdate"].isoformat() + if self == BlockingKey.MRN: + if "mrn" in data: + yield data["mrn"][-4:] + if self == BlockingKey.SEX: + if "sex" in data: + # TODO: Normalize this + yield data["sex"] + if self == BlockingKey.ZIP: + for address in data.get("address", []): + if "zip" in address: + yield address["zip"] + if self == BlockingKey.FIRST_NAME: + for name in data.get("name", []): + for given in name.get("given", []): + yield given[:4] + if self == BlockingKey.LAST_NAME: + for name in data.get("name", []): + if "family" in name: + yield name.get("family")[:4] -class BlockingKey(Base): - __tablename__ = "mpi_blocking_key" - - id: orm.Mapped[int] = orm.mapped_column(primary_key=True) - key: orm.Mapped[str] = orm.mapped_column(String(50), index=True) class BlockingValue(Base): __tablename__ = "mpi_blocking_value" + # create a composite index on patient_id, blockingkey and value + __table_args__ = ( + schema.Index( + "idx_blocking_value_patient_key_value", "patient_id", "blockingkey", "value" + ), + ) id: orm.Mapped[int] = orm.mapped_column(primary_key=True) - patient_id: orm.Mapped[int] = orm.mapped_column(ForeignKey("mpi_patient.id")) - blockingkey_id: orm.Mapped[int] = orm.mapped_column(ForeignKey("mpi_blocking_key.id")) - value: orm.Mapped[str] = orm.mapped_column(String(50), index=True) + patient_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_patient.id")) + blockingkey: orm.Mapped[int] = orm.mapped_column(sqltypes.Integer) + value: orm.Mapped[str] = orm.mapped_column(sqltypes.String(50), index=True) From f524140b4a5c9f8ffc778edd9a7e5481ff554183 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Tue, 17 Sep 2024 21:29:04 -0700 Subject: [PATCH 02/19] new implementations of get_block_data and insert_matched_patient for new schema --- src/recordlinker/linkage/simple_mpi.py | 112 +++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 src/recordlinker/linkage/simple_mpi.py diff --git a/src/recordlinker/linkage/simple_mpi.py b/src/recordlinker/linkage/simple_mpi.py new file mode 100644 index 00000000..9cc740eb --- /dev/null +++ b/src/recordlinker/linkage/simple_mpi.py @@ -0,0 +1,112 @@ +""" +recordlinker.linkage.simple_mpi +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This module provides the data access functions to the MPI tables +""" + +import typing + +from sqlalchemy import orm +from sqlalchemy.sql import expression + +from recordlinker.linkage import models + + +def get_block_data( + session: orm.Session, data: dict, algo_config: dict +) -> list[models.Patient]: + """ + Get all of the matching Patients for the given data using the provided + blocking keys defined in the algo_config. + """ + # Create the base query + query = session.query(models.Patient) + + # Build the join criteria + for block in algo_config["blocks"]: + key_name = block["value"].upper() + # Get the matching Blocking Key based on the value in the algo_config + assert hasattr(models.BlockingKey, key_name), f"Invalid Blocking Key: {block}" + key = models.BlockingKey[key_name] + # Get all the possible values from the data for this key + vals = [v for v in key.to_value(data)] + # Add a join clause to the mpi_blocking_value table for each Blocking Key. + # This results in multiple joins to the same table, one for each Key, but + # with different joining conditions. + query = query.join( + models.BlockingValue, + expression.and_( + models.Patient.id == models.BlockingValue.patient_id, + models.BlockingValue.blockingkey == key.id, + models.BlockingValue.value.in_(vals), + ), + ) + + return query.all() + + +def insert_matched_patient( + session: orm.Session, + data: dict, + person_id: typing.Optional[int], + external_person_id: typing.Optional[str], + commit: bool = True, +) -> models.Patient: + """ + Insert a new patient record into the database. + """ + if person_id is None: + # create a new Person record + person = models.Person() + session.add(person) + # flush the session to get the person_id + session.flush() + person_id = person.id + + if external_person_id is not None: + # create a new ExternalPerson record if it doesn't exist + external_person = models.ExternalPerson( + person_id=person_id, + external_id=external_person_id, + source="IRIS", + ) + session.merge(external_person) + + # create a new Patient record + patient = models.Patient(person_id=person_id, data=data) + session.add(patient) + + # insert blocking keys + insert_blocking_keys(session, patient, commit=False) + + if commit: + session.commit() + return patient + + +def insert_blocking_keys( + session: orm.Session, + patient: models.Patient, + commit: bool = True, +) -> list[models.BlockingValue]: + """ + Inserts blocking keys for a patient record into the MPI database. + """ + values: list[models.BlockingValue] = [] + # Iterate over all the Blocking Keys + for key in models.BlockingKey: + # For each Key, get all the values from the data dictionary + # Many Keys will only have 1 value, but its possible that + # a PII data dict could have multiple given names + for val in key.to_value(patient.data): + values.append( + models.BlockingValue( + patient_id=patient.id, blockingkey=key.id, value=val + ) + ) + session.add_all(values) + + if commit: + session.commit() + return values From 2cfdc420c9a572c0d6512e57750d5ce3e21a3764 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 12:06:45 -0700 Subject: [PATCH 03/19] pinning pyway to 0.3.29 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 152434d7..268fdf57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "fhirpathpy", "rapidfuzz", # TODO: replace pyway with alembic for managing database migrations - "pyway" + "pyway==0.3.29" ] [project.optional-dependencies] From 2f9727a30d037fb2ad807a3c89c4c3844b17b15b Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 13:42:14 -0700 Subject: [PATCH 04/19] adding test cases for BlockingKey --- pyproject.toml | 1 + src/recordlinker/linkage/models.py | 75 +++++++++++++++++++------- tests/unit/test_models.py | 87 ++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 19 deletions(-) create mode 100644 tests/unit/test_models.py diff --git a/pyproject.toml b/pyproject.toml index 268fdf57..66fbbd14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "fastapi", "pydantic", "pydantic-settings", + "python-dateutil==2.9.0", "sqlalchemy", "fhirpathpy", "rapidfuzz", diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 98c42b51..369df5cc 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -1,7 +1,8 @@ +import datetime import enum -import typing import uuid +import dateutil.parser import dotenv from sqlalchemy import create_engine from sqlalchemy import orm @@ -59,7 +60,7 @@ class BlockingKey(enum.Enum): the algorithm. By defining them all upfront, we give the user flexibility in adjusting their algorithm configuration without having to reload the data. - **HERE BE DRAGONS**: IN A REAL SYSTEM, THESE ENUMS SHOULD NOT BE CHANGED!!! + **HERE BE DRAGONS**: IN A PRODUCTION SYSTEM, THESE ENUMS SHOULD NOT BE CHANGED!!! """ BIRTHDATE = 1, "Date of Birth" MRN = 2, "Last 4 chars of MRN" @@ -72,34 +73,70 @@ def __init__(self, id: int, description: str): self.id = id self.description = description - def to_value(self, data: dict) -> typing.Generator[str, None, None]: + def to_value(self, data: dict) -> list[str]: """ Given a data dictionary of Patient PII data, return a generator of all possible values for this Key. Many Keys will only have 1 possible value, but some (like first name) could have multiple values. """ if self == BlockingKey.BIRTHDATE: - if "birthdate" in data: - yield data["birthdate"].isoformat() + return self._extract_birthdate(data) if self == BlockingKey.MRN: - if "mrn" in data: - yield data["mrn"][-4:] + return self._extract_mrn_last_four(data) if self == BlockingKey.SEX: - if "sex" in data: - # TODO: Normalize this - yield data["sex"] + return self._extract_sex(data) if self == BlockingKey.ZIP: - for address in data.get("address", []): - if "zip" in address: - yield address["zip"] + return self._extract_zipcode(data) if self == BlockingKey.FIRST_NAME: - for name in data.get("name", []): - for given in name.get("given", []): - yield given[:4] + return self._extract_first_name_first_four(data) if self == BlockingKey.LAST_NAME: - for name in data.get("name", []): - if "family" in name: - yield name.get("family")[:4] + return self._extract_last_name_first_four(data) + return [] + + def _extract_birthdate(self, data: dict) -> list[str]: + if "birthdate" in data: + val = data["birthdate"] + if not isinstance(val, (datetime.date, datetime.datetime)): + # if not an instance of date or datetime, try to parse it + val = dateutil.parser.parse(str(val)) + return [val.strftime("%Y-%m-%d")] + return [] + + def _extract_mrn_last_four(self, data: dict) -> list[str]: + if "mrn" in data: + return [data["mrn"].strip()[-4:]] + return [] + + def _extract_sex(self, data: dict) -> list[str]: + if "sex" in data: + val = str(data["sex"]).lower().strip() + if val in ["m", "male"]: + return ["m"] + elif val in ["f", "female"]: + return ["f"] + return ["u"] + return [] + + def _extract_zipcode(self, data: dict) -> list[str]: + zipcodes = [] + for address in data.get("address", []): + if "zip" in address: + zipcodes.append(address["zip"].strip()[0:5]) + return zipcodes + + def _extract_first_name_first_four(self, data: dict) -> list[str]: + names = [] + for name in data.get("name", []): + for given in name.get("given", []): + names.append(given[:4]) + return names + + def _extract_last_name_first_four(self, data: dict) -> list[str]: + names = [] + for name in data.get("name", []): + if "family" in name: + names.append(name.get("family")[:4]) + return names class BlockingValue(Base): diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py new file mode 100644 index 00000000..a8713c78 --- /dev/null +++ b/tests/unit/test_models.py @@ -0,0 +1,87 @@ +""" +unit.test_models.py +~~~~~~~~~~~~~~~~~~~ + +This module contains the unit tests for the recordlinker.linkage.models module. +""" + +import datetime +import unittest + +from recordlinker.linkage import models + + +class BlockingKeyTests(unittest.TestCase): + def test_extract_birthdate(self): + data = {"dob": "01/01/1980"} + self.assertEqual( + models.BlockingKey.BIRTHDATE.to_value(data), [] + ) + data = {"birthdate": "01/01/1980"} + self.assertEqual( + models.BlockingKey.BIRTHDATE.to_value(data), ["1980-01-01"] + ) + data = {"birthdate": datetime.date(1980, 1, 1)} + self.assertEqual( + models.BlockingKey.BIRTHDATE.to_value(data), ["1980-01-01"] + ) + data = {"birthdate": datetime.datetime(1980, 1, 1, 12, 30, 0)} + self.assertEqual( + models.BlockingKey.BIRTHDATE.to_value(data), ["1980-01-01"] + ) + + def test_extract_mrn_last_four(self): + data = {"ssn": "123456789"} + self.assertEqual(models.BlockingKey.MRN.to_value(data), []) + data = {"mrn": "123456789"} + self.assertEqual(models.BlockingKey.MRN.to_value(data), ["6789"]) + data = {"mrn": "89"} + self.assertEqual(models.BlockingKey.MRN.to_value(data), ["89"]) + + def test_extract_sex(self): + data = {"gender": "M"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), []) + data = {"sex": "M"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["m"]) + data = {"sex": "Male"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["m"]) + data = {"sex": "f"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["f"]) + data = {"sex": "FEMALE"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["f"]) + data = {"sex": "other"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["u"]) + data = {"sex": "unknown"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["u"]) + data = {"sex": "?"} + self.assertEqual(models.BlockingKey.SEX.to_value(data), ["u"]) + + def test_extract_zipcode(self): + data = {"zip": "12345"} + self.assertEqual(models.BlockingKey.ZIP.to_value(data), []) + data = {"address": [{"zip": "12345"}]} + self.assertEqual(models.BlockingKey.ZIP.to_value(data), ["12345"]) + data = {"address": [{"zip": "12345-6789"}]} + self.assertEqual(models.BlockingKey.ZIP.to_value(data), ["12345"]) + data = {"address": [{"zip": "12345-6789"}, {"zip": "54321"}]} + self.assertEqual(models.BlockingKey.ZIP.to_value(data), ["12345", "54321"]) + + def test_extract_first_name_first_four(self): + data = {"first_name": "John"} + self.assertEqual(models.BlockingKey.FIRST_NAME.to_value(data), []) + data = {"name": [{"given": ["John", "Jane"]}]} + self.assertEqual( + models.BlockingKey.FIRST_NAME.to_value(data), ["John", "Jane"] + ) + data = {"name": [{"given": ["Janet", "Johnathon"]}]} + self.assertEqual( + models.BlockingKey.FIRST_NAME.to_value(data), ["Jane", "John"] + ) + + def test_extract_last_name_first_four(self): + data = {"last_name": "Doe"} + self.assertEqual(models.BlockingKey.LAST_NAME.to_value(data), []) + data = {"name": [{"family": "Doe"}]} + self.assertEqual(models.BlockingKey.LAST_NAME.to_value(data), ["Doe"]) + data = {"name": [{"family": "Smith"}, {"family": "Doe"}]} + self.assertEqual(models.BlockingKey.LAST_NAME.to_value(data), ["Smit", "Doe"]) From 1a70c8456fc77fdccfd854562b5a245612d45c3f Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 21:52:02 -0700 Subject: [PATCH 05/19] moving external person into patient table --- ...9_external_person_id_should_be_nullable.py | 42 ++++++++++++++++++ ..._moving_externalperson_to_patient_table.py | 43 +++++++++++++++++++ src/recordlinker/linkage/models.py | 32 +++++++------- 3 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py create mode 100644 alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py diff --git a/alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py b/alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py new file mode 100644 index 00000000..26fe53b7 --- /dev/null +++ b/alembic/versions/64ed9566f189_external_person_id_should_be_nullable.py @@ -0,0 +1,42 @@ +"""external person id should be nullable + +Revision ID: 64ed9566f189 +Revises: bfbd015ca466 +Create Date: 2024-09-18 20:22:07.510203 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '64ed9566f189' +down_revision: Union[str, None] = 'bfbd015ca466' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index('ix_mpi_blocking_value_value', table_name='mpi_blocking_value') + op.alter_column('mpi_patient', 'external_person_id', + existing_type=sa.VARCHAR(length=255), + nullable=True) + op.alter_column('mpi_patient', 'external_person_source', + existing_type=sa.VARCHAR(length=100), + nullable=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column('mpi_patient', 'external_person_source', + existing_type=sa.VARCHAR(length=100), + nullable=False) + op.alter_column('mpi_patient', 'external_person_id', + existing_type=sa.VARCHAR(length=255), + nullable=False) + op.create_index('ix_mpi_blocking_value_value', 'mpi_blocking_value', ['value'], unique=False) + # ### end Alembic commands ### diff --git a/alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py b/alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py new file mode 100644 index 00000000..800e44b3 --- /dev/null +++ b/alembic/versions/bfbd015ca466_moving_externalperson_to_patient_table.py @@ -0,0 +1,43 @@ +"""moving ExternalPerson to Patient table + +Revision ID: bfbd015ca466 +Revises: ad18f1d41fad +Create Date: 2024-09-18 20:10:30.193941 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'bfbd015ca466' +down_revision: Union[str, None] = 'ad18f1d41fad' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('mpi_external_person') + op.create_index(op.f('ix_mpi_blocking_value_value'), 'mpi_blocking_value', ['value'], unique=False) + op.add_column('mpi_patient', sa.Column('external_person_id', sa.String(length=255), nullable=False)) + op.add_column('mpi_patient', sa.Column('external_person_source', sa.String(length=100), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('mpi_patient', 'external_person_source') + op.drop_column('mpi_patient', 'external_person_id') + op.drop_index(op.f('ix_mpi_blocking_value_value'), table_name='mpi_blocking_value') + op.create_table('mpi_external_person', + sa.Column('id', sa.INTEGER(), nullable=False), + sa.Column('person_id', sa.INTEGER(), nullable=False), + sa.Column('external_id', sa.VARCHAR(length=255), nullable=False), + sa.Column('source', sa.VARCHAR(length=255), nullable=False), + sa.ForeignKeyConstraint(['person_id'], ['mpi_person.id'], ), + sa.PrimaryKeyConstraint('id') + ) + # ### end Alembic commands ### diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 369df5cc..877edadd 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -31,15 +31,7 @@ class Person(Base): id: orm.Mapped[int] = orm.mapped_column(primary_key=True) internal_id: orm.Mapped[uuid.UUID] = orm.mapped_column(default=uuid.uuid4) - - -class ExternalPerson(Base): - __tablename__ = "mpi_external_person" - - id: orm.Mapped[int] = orm.mapped_column(primary_key=True) - person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_person.id")) - external_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255)) - source: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255)) + patients: orm.Mapped[list["Patient"]] = orm.relationship(back_populates="person") class Patient(Base): @@ -47,7 +39,11 @@ class Patient(Base): id: orm.Mapped[int] = orm.mapped_column(primary_key=True) person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_person.id")) + person: orm.Mapped["Person"] = orm.relationship(back_populates="patients") data: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON) + external_person_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), nullable=True) + external_person_source: orm.Mapped[str] = orm.mapped_column(sqltypes.String(100), nullable=True) + blocking_values: orm.Mapped[list["BlockingValue"]] = orm.relationship(back_populates="patient") class BlockingKey(enum.Enum): @@ -120,22 +116,25 @@ def _extract_sex(self, data: dict) -> list[str]: def _extract_zipcode(self, data: dict) -> list[str]: zipcodes = [] for address in data.get("address", []): - if "zip" in address: - zipcodes.append(address["zip"].strip()[0:5]) + if isinstance(address, dict): + if "zip" in address: + zipcodes.append(str(address["zip"]).strip()[0:5]) return zipcodes def _extract_first_name_first_four(self, data: dict) -> list[str]: names = [] for name in data.get("name", []): - for given in name.get("given", []): - names.append(given[:4]) + if isinstance(name, dict): + for given in name.get("given", []): + names.append(str(given)[:4]) return names def _extract_last_name_first_four(self, data: dict) -> list[str]: names = [] for name in data.get("name", []): - if "family" in name: - names.append(name.get("family")[:4]) + if isinstance(name, dict): + if "family" in name: + names.append(str(name["family"])[:4]) return names @@ -150,5 +149,6 @@ class BlockingValue(Base): id: orm.Mapped[int] = orm.mapped_column(primary_key=True) patient_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_patient.id")) + patient: orm.Mapped["Patient"] = orm.relationship(back_populates="blocking_values") blockingkey: orm.Mapped[int] = orm.mapped_column(sqltypes.Integer) - value: orm.Mapped[str] = orm.mapped_column(sqltypes.String(50), index=True) + value: orm.Mapped[str] = orm.mapped_column(sqltypes.String(50)) From 84722f2cd8bc619eeee9c340ec983b286f12e424 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 21:52:19 -0700 Subject: [PATCH 06/19] ignore public docstrings in tests --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 66fbbd14..496edbcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ select = ["E4", "E7", "E9", "F", "I", "D102", "D103", "D104", "D105", "D106"] # [tool.ruff.lint.per-file-ignores] "**/__init__.py" = ["D"] +"tests/*.py" = ["D102", "D103"] # Ignore the public docstring rules in test files [tool.ruff.lint.isort] # The following settings reduce the number of changes from reorder-python-imports From 9dc1a26e9c88d54e0945092f98521734d650c11d Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 21:57:32 -0700 Subject: [PATCH 07/19] ignoring .DS_Store --- .gitignore | 3 ++- assets/.DS_Store | Bin 6148 -> 0 bytes src/.DS_Store | Bin 6148 -> 0 bytes src/recordlinker/.DS_Store | Bin 6148 -> 0 bytes 4 files changed, 2 insertions(+), 1 deletion(-) delete mode 100644 assets/.DS_Store delete mode 100644 src/.DS_Store delete mode 100644 src/recordlinker/.DS_Store diff --git a/.gitignore b/.gitignore index ce0064f4..1ed11d31 100644 --- a/.gitignore +++ b/.gitignore @@ -27,7 +27,8 @@ temp/ *.tmp # macOS -.DS_Store +# ignore all files ending in .DS_Store +**/.DS_Store # Python-Virtual Environments .venv diff --git a/assets/.DS_Store b/assets/.DS_Store deleted file mode 100644 index bfe8c911e5bd2016b197e70bbc447a40b53047ee..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKOHM;E478zAE3xUa%UppQL{&Kf7vNKb5KmMBv45+ct4CwTwh+~8*|9-m$(h9Q zB+4wFpUuqIpWAD*rI{_@MB8(i8u#fFyQm1!<#{d zzZ*!H+h8x)dd0y=%A<4Z^m&R(0VyB_q<|EV0>3MOnoV0=C2EucQa}oPE5PqVgA;q< zm>8c94ABArX9$O39=!yxF#zm^VLuSx_R7;bLwWV4#nek z#9O4B_e70SKnffyaGA>`>;Db>O#gpO(n<$EObYz90$=OQ8?gWY diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 106db7e4e86d5734173d8a99ec0e18b67c4d40b0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKUrPc(5T8}c6A64MsJDe)h14X8UJkP_&_z8|*j*=eblr-l-Gd1CUSFs$(&uSr zca73|3L<0%W`67Z*>ip;yAA+|?j&pi)BwOiB`j3Xd?OSmU6O+J6cL5qqYop9gDf2e z@lrNB{v!kQ?#d8B2pRbB>HVnyBd@_Qj?!_X@g@qz(%O32DLYl?+PhboH}=PqwCj&9 z>C>4~aWG%|!Fe>u`nAntm5lu;8H{v66b>-u>LN-)m37r52~%C`nSfJqD*f8dblPln z4(qbhZqDj*de~~zWqbc%Hmf*WyL(5c{fGE5QO|};fj>Jf8y4sAf{TT=!t^!{@M}`Z zLg>jGC9z8GFf!k8z7Zoczzi@0tH^*o>zwK;Hpe|N1I)m$7@+e(q7u3mbA$TmK%++h z#1gu-;F@L$s*x65i@8CJpa@fnXi9}0F@z~czqEO-#oVAN2VsW~VV^AQgd+6Qaet}P zLAVClG6T%OR|aVIL#fgIfAaJEuanp?1I)l`G9XGluh+%i?A^N5CEc|W^#+xM@^XV8 lCAiR6G3L@$yp5^_{gM`luEpFSdQkX}fTn>BGw`PjyaP7QQuqJ> diff --git a/src/recordlinker/.DS_Store b/src/recordlinker/.DS_Store deleted file mode 100644 index 19ba45aac82c3fb1cd5b86fd5d8d87f14b039aee..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%Sr=55UkdKfn0LTael!+7()Dl{D7K>h>JOh_dWStewx(}#IVUJc#&%8u9@Dh z9i|T3+W>6+vA+eD0OoW@e0rFgKX;$lRb`Aw=Q|$ojHmtkZg@OQs!u1JdxHTlc*E;& z{$V&xP*w^^0VyB_q<|E-NCEG?wD}@YQ3^-_De$9!e;*p%u@{bs@#$cQ5r8;jI*jX> zC5X)v#9lZiGDEYZ5|e5*Vp!6dZ&lX|$Hb(=YWT1^*=j Date: Wed, 18 Sep 2024 21:58:41 -0700 Subject: [PATCH 08/19] changing tests to pytest style --- tests/unit/test_models.py | 65 ++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py index a8713c78..c3abb76f 100644 --- a/tests/unit/test_models.py +++ b/tests/unit/test_models.py @@ -6,82 +6,69 @@ """ import datetime -import unittest from recordlinker.linkage import models -class BlockingKeyTests(unittest.TestCase): +class TestBlockingKey: def test_extract_birthdate(self): data = {"dob": "01/01/1980"} - self.assertEqual( - models.BlockingKey.BIRTHDATE.to_value(data), [] - ) + assert models.BlockingKey.BIRTHDATE.to_value(data) == [] data = {"birthdate": "01/01/1980"} - self.assertEqual( - models.BlockingKey.BIRTHDATE.to_value(data), ["1980-01-01"] - ) + assert models.BlockingKey.BIRTHDATE.to_value(data) == ["1980-01-01"] data = {"birthdate": datetime.date(1980, 1, 1)} - self.assertEqual( - models.BlockingKey.BIRTHDATE.to_value(data), ["1980-01-01"] - ) + assert models.BlockingKey.BIRTHDATE.to_value(data) == ["1980-01-01"] data = {"birthdate": datetime.datetime(1980, 1, 1, 12, 30, 0)} - self.assertEqual( - models.BlockingKey.BIRTHDATE.to_value(data), ["1980-01-01"] - ) + assert models.BlockingKey.BIRTHDATE.to_value(data) == ["1980-01-01"] def test_extract_mrn_last_four(self): data = {"ssn": "123456789"} - self.assertEqual(models.BlockingKey.MRN.to_value(data), []) + assert models.BlockingKey.MRN.to_value(data) == [] data = {"mrn": "123456789"} - self.assertEqual(models.BlockingKey.MRN.to_value(data), ["6789"]) + assert models.BlockingKey.MRN.to_value(data) == ["6789"] data = {"mrn": "89"} - self.assertEqual(models.BlockingKey.MRN.to_value(data), ["89"]) + assert models.BlockingKey.MRN.to_value(data) == ["89"] def test_extract_sex(self): data = {"gender": "M"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), []) + assert models.BlockingKey.SEX.to_value(data) == [] data = {"sex": "M"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["m"]) + assert models.BlockingKey.SEX.to_value(data) == ["m"] data = {"sex": "Male"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["m"]) + assert models.BlockingKey.SEX.to_value(data) == ["m"] data = {"sex": "f"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["f"]) + assert models.BlockingKey.SEX.to_value(data) == ["f"] data = {"sex": "FEMALE"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["f"]) + assert models.BlockingKey.SEX.to_value(data) == ["f"] data = {"sex": "other"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["u"]) + assert models.BlockingKey.SEX.to_value(data) == ["u"] data = {"sex": "unknown"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["u"]) + assert models.BlockingKey.SEX.to_value(data) == ["u"] data = {"sex": "?"} - self.assertEqual(models.BlockingKey.SEX.to_value(data), ["u"]) + assert models.BlockingKey.SEX.to_value(data) == ["u"] def test_extract_zipcode(self): data = {"zip": "12345"} - self.assertEqual(models.BlockingKey.ZIP.to_value(data), []) + assert models.BlockingKey.ZIP.to_value(data) == [] data = {"address": [{"zip": "12345"}]} - self.assertEqual(models.BlockingKey.ZIP.to_value(data), ["12345"]) + assert models.BlockingKey.ZIP.to_value(data) == ["12345"] data = {"address": [{"zip": "12345-6789"}]} - self.assertEqual(models.BlockingKey.ZIP.to_value(data), ["12345"]) + assert models.BlockingKey.ZIP.to_value(data) == ["12345"] data = {"address": [{"zip": "12345-6789"}, {"zip": "54321"}]} - self.assertEqual(models.BlockingKey.ZIP.to_value(data), ["12345", "54321"]) + assert models.BlockingKey.ZIP.to_value(data) == ["12345", "54321"] def test_extract_first_name_first_four(self): data = {"first_name": "John"} - self.assertEqual(models.BlockingKey.FIRST_NAME.to_value(data), []) + assert models.BlockingKey.FIRST_NAME.to_value(data) == [] data = {"name": [{"given": ["John", "Jane"]}]} - self.assertEqual( - models.BlockingKey.FIRST_NAME.to_value(data), ["John", "Jane"] - ) + assert models.BlockingKey.FIRST_NAME.to_value(data) == ["John", "Jane"] data = {"name": [{"given": ["Janet", "Johnathon"]}]} - self.assertEqual( - models.BlockingKey.FIRST_NAME.to_value(data), ["Jane", "John"] - ) + assert models.BlockingKey.FIRST_NAME.to_value(data) == ["Jane", "John"] def test_extract_last_name_first_four(self): data = {"last_name": "Doe"} - self.assertEqual(models.BlockingKey.LAST_NAME.to_value(data), []) + assert models.BlockingKey.LAST_NAME.to_value(data) == [] data = {"name": [{"family": "Doe"}]} - self.assertEqual(models.BlockingKey.LAST_NAME.to_value(data), ["Doe"]) + assert models.BlockingKey.LAST_NAME.to_value(data) == ["Doe"] data = {"name": [{"family": "Smith"}, {"family": "Doe"}]} - self.assertEqual(models.BlockingKey.LAST_NAME.to_value(data), ["Smit", "Doe"]) + assert models.BlockingKey.LAST_NAME.to_value(data) == ["Smit", "Doe"] From 594cba2551e4f822a688eb1f69f706bee78e1bf5 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 21:59:25 -0700 Subject: [PATCH 09/19] adding TEST_DB_URI for short-term --- .env | 1 + src/recordlinker/config.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.env b/.env index a10ca2d5..c50cc176 100644 --- a/.env +++ b/.env @@ -1 +1,2 @@ DB_URI="postgresql+psycopg2://postgres:pw@localhost:5432/postgres" +TEST_DB_URI="sqlite:///:memory:" diff --git a/src/recordlinker/config.py b/src/recordlinker/config.py index 6fe2e401..98df10d1 100644 --- a/src/recordlinker/config.py +++ b/src/recordlinker/config.py @@ -10,6 +10,9 @@ class Settings(pydantic_settings.BaseSettings): ) db_uri: str = pydantic.Field(description="The URI for the MPI database") + # FIXME: A separate URI for testing is temporary, this is necessary right now because + # the old schema only works with postgresql. Once the old schema is removed we can consolidate + test_db_uri: str = pydantic.Field(description="The URI for the MPI database to run tests against") connection_pool_size: typing.Optional[int] = pydantic.Field( description="The number of MPI database connections in the connection pool", default=5, From 22c92dfffeedba4ba7dd75d8bca9d2a2b13c4fed Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Wed, 18 Sep 2024 21:59:49 -0700 Subject: [PATCH 10/19] tests for simple mpi --- src/recordlinker/linkage/simple_mpi.py | 49 ++++----- tests/unit/test_simple_mpi.py | 136 +++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 27 deletions(-) create mode 100644 tests/unit/test_simple_mpi.py diff --git a/src/recordlinker/linkage/simple_mpi.py b/src/recordlinker/linkage/simple_mpi.py index 9cc740eb..d99b2c2b 100644 --- a/src/recordlinker/linkage/simple_mpi.py +++ b/src/recordlinker/linkage/simple_mpi.py @@ -23,58 +23,55 @@ def get_block_data( # Create the base query query = session.query(models.Patient) - # Build the join criteria - for block in algo_config["blocks"]: + # Build the join criteria, we are joining the Blocking Value table + # multiple times, once for each Blocking Key. If a Patient record + # has a matching Blocking Value for all the Blocking Keys, then it + # is considered a match. + for idx, block in enumerate(algo_config["blocks"]): key_name = block["value"].upper() # Get the matching Blocking Key based on the value in the algo_config assert hasattr(models.BlockingKey, key_name), f"Invalid Blocking Key: {block}" key = models.BlockingKey[key_name] # Get all the possible values from the data for this key vals = [v for v in key.to_value(data)] + # Create a dynamic alias for the Blocking Value table using the index + # this is necessary since we are potentially joining the same table + # multiple times with different conditions + alias = orm.aliased(models.BlockingValue, name=f"bv{idx}") # Add a join clause to the mpi_blocking_value table for each Blocking Key. # This results in multiple joins to the same table, one for each Key, but # with different joining conditions. query = query.join( - models.BlockingValue, + alias, expression.and_( - models.Patient.id == models.BlockingValue.patient_id, - models.BlockingValue.blockingkey == key.id, - models.BlockingValue.value.in_(vals), + models.Patient.id == alias.patient_id, + alias.blockingkey == key.id, + alias.value.in_(vals), ), ) - return query.all() +# TODO: should this method be renamed to insert_patient def insert_matched_patient( session: orm.Session, data: dict, - person_id: typing.Optional[int], - external_person_id: typing.Optional[str], + person_id: typing.Optional[int] = None, + external_person_id: typing.Optional[str] = None, commit: bool = True, ) -> models.Patient: """ Insert a new patient record into the database. """ - if person_id is None: - # create a new Person record - person = models.Person() - session.add(person) - # flush the session to get the person_id - session.flush() - person_id = person.id + # create a new Person record if one isn't provided + person = models.Person() if not person_id else session.get(models.Person, person_id) + patient = models.Patient(person=person, data=data) if external_person_id is not None: - # create a new ExternalPerson record if it doesn't exist - external_person = models.ExternalPerson( - person_id=person_id, - external_id=external_person_id, - source="IRIS", - ) - session.merge(external_person) + patient.external_person_id = external_person_id + patient.external_person_source = "IRIS" # create a new Patient record - patient = models.Patient(person_id=person_id, data=data) session.add(patient) # insert blocking keys @@ -101,9 +98,7 @@ def insert_blocking_keys( # a PII data dict could have multiple given names for val in key.to_value(patient.data): values.append( - models.BlockingValue( - patient_id=patient.id, blockingkey=key.id, value=val - ) + models.BlockingValue(patient=patient, blockingkey=key.id, value=val) ) session.add_all(values) diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py new file mode 100644 index 00000000..f2086b9f --- /dev/null +++ b/tests/unit/test_simple_mpi.py @@ -0,0 +1,136 @@ +""" +unit.test_simple_mpi.py +~~~~~~~~~~~~~~~~~~~~~~~ + +This module contains the unit tests for the recordlinker.linkage.simple_mpi module. +""" + +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import scoped_session +from sqlalchemy.orm import sessionmaker + +from recordlinker.config import settings +from recordlinker.linkage import models +from recordlinker.linkage import simple_mpi + + +# Fixture to create a new in-memory SQLite database and session for each test +@pytest.fixture(scope="function") +def session(): + # Create an in-memory SQLite engine + engine = create_engine(settings.test_db_uri) + models.Base.metadata.create_all(engine) # Create all tables in the in-memory database + + # Create a new session factory and scoped session + Session = scoped_session(sessionmaker(bind=engine)) + session = Session() + + yield session # This is where the testing happens + + session.close() # Cleanup after test + models.Base.metadata.drop_all(engine) # Drop all tables after the test + + +@pytest.fixture(scope="function") +def new_patient(session): + patient = models.Patient(person=models.Person(), data={}) + session.add(patient) + session.flush() + return patient + + +class TestInsertBlockingKeys: + def test_patient_no_blocking_keys(self, session, new_patient): + new_patient.data = {"name": "John Doe"} + assert simple_mpi.insert_blocking_keys(session, new_patient) == [] + + def test_patient_with_blocking_keys(self, session, new_patient): + new_patient.data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} + keys = simple_mpi.insert_blocking_keys(session, new_patient) + assert len(keys) == 4 + assert keys[0].patient_id == new_patient.id + assert keys[0].blockingkey == models.BlockingKey.BIRTHDATE.id + assert keys[0].value == "1980-01-01" + assert keys[1].patient_id == new_patient.id + assert keys[1].blockingkey == models.BlockingKey.FIRST_NAME.id + assert keys[1].value == "John" + assert keys[2].patient_id == new_patient.id + assert keys[2].blockingkey == models.BlockingKey.FIRST_NAME.id + assert keys[2].value == "Bill" + assert keys[3].patient_id == new_patient.id + assert keys[3].blockingkey == models.BlockingKey.LAST_NAME.id + assert keys[3].value == "Smit" + + +class TestInsertMatchedPatient: + def test_no_person(self, session): + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} + patient = simple_mpi.insert_matched_patient(session, data) + assert patient.person_id is not None + assert patient.data == data + assert patient.external_person_id is None + assert patient.external_person_source is None + assert patient.person.internal_id is not None + assert patient.person.id == patient.person_id + assert len(patient.blocking_values) == 4 + + def test_no_person_with_external_id(self, session): + data = {"name": [{"given": ["Johnathon",], "family": "Smith"}], "birthdate": "01/01/1980"} + patient = simple_mpi.insert_matched_patient(session, data, external_person_id="123456") + assert patient.person_id is not None + assert patient.data == data + assert patient.external_person_id == "123456" + assert patient.external_person_source == "IRIS" + assert patient.person.internal_id is not None + assert patient.person.id is not None + assert patient.person.id == patient.person_id + assert len(patient.blocking_values) == 3 + + def test_with_person(self, session): + person = models.Person() + session.add(person) + session.flush() + data = {"name": [{"given": ["George",], "family": "Harrison"}], "birthdate": "1943-2-25"} + patient = simple_mpi.insert_matched_patient(session, data, person_id=person.id) + assert patient.person_id == person.id + assert patient.data == data + assert patient.external_person_id is None + assert patient.external_person_source is None + assert len(patient.blocking_values) == 3 + + +class TestGetBlockData: + @pytest.fixture + def prime_index(self, session): + data = [ + {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"}, + {"name": [{"given": ["George",], "family": "Harrison"}], "birthdate": "1943-2-25"}, + {"name": [{"given": ["John",], "family": "Doe"}], "birthdate": "1980-01-01"}, + {"name": [{"given": ["Bill",], "family": "Smith"}], "birthdate": "1980-01-01"}, + {"name": [{"given": ["John",], "family": "Smith"}], "birthdate": "1980-01-01"}, + {"name": [{"given": ["John",], "family": "Smith"}], "birthdate": "1985-11-12"}, + ] + for datum in data: + simple_mpi.insert_matched_patient(session, datum) + + def test_match_on_birthdate(self, session, prime_index): + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} + algo_config = {"blocks": [{"value": "birthdate"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 4 + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "11/12/1985"} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 1 + + def test_match_on_first_name(self, session, prime_index): + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} + algo_config = {"blocks": [{"value": "first_name"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 5 + + def test_match_on_birthdate_and_first_name(self, session, prime_index): + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} + algo_config = {"blocks": [{"value": "birthdate"}, {"value": "first_name"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 4 From eb5511fe82c83a24aa70bd35d249998780031afc Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 19 Sep 2024 08:44:10 -0700 Subject: [PATCH 11/19] adding another test for get_block_data --- tests/unit/test_simple_mpi.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py index f2086b9f..829c773d 100644 --- a/tests/unit/test_simple_mpi.py +++ b/tests/unit/test_simple_mpi.py @@ -134,3 +134,15 @@ def test_match_on_birthdate_and_first_name(self, session, prime_index): algo_config = {"blocks": [{"value": "birthdate"}, {"value": "first_name"}]} matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 4 + + def test_match_on_birthdate_first_name_and_last_name(self, session, prime_index): + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} + algo_config = {"blocks": [{"value": "birthdate"}, {"value": "first_name"}, {"value": "last_name"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 3 + data = {"name": [{"given": ["Billy",], "family": "Smitty"}], "birthdate": "Jan 1 1980"} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 2 + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Doeherty"}], "birthdate": "Jan 1 1980"} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 0 From abbfbf1cc8320653fe35250938bd3e29e5a9e3fa Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 19 Sep 2024 12:56:42 -0700 Subject: [PATCH 12/19] removing dep on python-dotenv --- alembic/env.py | 5 ++--- pyproject.toml | 1 - src/recordlinker/linkage/models.py | 8 +++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/alembic/env.py b/alembic/env.py index 992d7a34..50b1ec0a 100644 --- a/alembic/env.py +++ b/alembic/env.py @@ -5,8 +5,6 @@ from alembic import context -import dotenv -ENV = dotenv.dotenv_values() # this is the Alembic Config object, which provides # access to the values within the .ini file in use. @@ -16,7 +14,8 @@ # This line sets up loggers basically. if config.config_file_name is not None: fileConfig(config.config_file_name) -config.set_main_option("sqlalchemy.url", ENV["DB_URI"]) +from recordlinker.config import settings +config.set_main_option("sqlalchemy.url", settings.DB_URI) # add your model's MetaData object here # for 'autogenerate' support diff --git a/pyproject.toml b/pyproject.toml index 496edbcc..b005fdcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,6 @@ dev = [ "pyarrow", "httpx", "alembic", - "python-dotenv" ] prod = [ # List any additional production-only dependencies here diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 877edadd..58fe2146 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -3,21 +3,19 @@ import uuid import dateutil.parser -import dotenv from sqlalchemy import create_engine from sqlalchemy import orm from sqlalchemy import schema from sqlalchemy import types as sqltypes +from recordlinker.config import settings + def get_session() -> orm.Session: """ Creates a new session to the MPI database and returns it. """ - db_uri = dotenv.dotenv_values()["DB_URI"] - if not db_uri: - raise ValueError("DB_URI environment variable not set") - engine = create_engine(db_uri) + engine = create_engine(settings.db_uri) Base.metadata.create_all(engine) return orm.Session(engine) From 2f4c8124037fefd1b66befe30c4681c2892b588f Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 19 Sep 2024 12:57:17 -0700 Subject: [PATCH 13/19] adding Patient.external_patient_id --- src/recordlinker/linkage/models.py | 1 + src/recordlinker/linkage/simple_mpi.py | 5 ++++- tests/unit/test_simple_mpi.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 58fe2146..64acc336 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -39,6 +39,7 @@ class Patient(Base): person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_person.id")) person: orm.Mapped["Person"] = orm.relationship(back_populates="patients") data: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON) + external_patient_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), nullable=True) external_person_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), nullable=True) external_person_source: orm.Mapped[str] = orm.mapped_column(sqltypes.String(100), nullable=True) blocking_values: orm.Mapped[list["BlockingValue"]] = orm.relationship(back_populates="patient") diff --git a/src/recordlinker/linkage/simple_mpi.py b/src/recordlinker/linkage/simple_mpi.py index d99b2c2b..0217e42e 100644 --- a/src/recordlinker/linkage/simple_mpi.py +++ b/src/recordlinker/linkage/simple_mpi.py @@ -57,6 +57,7 @@ def insert_matched_patient( session: orm.Session, data: dict, person_id: typing.Optional[int] = None, + external_patient_id: typing.Optional[str] = None, external_person_id: typing.Optional[str] = None, commit: bool = True, ) -> models.Patient: @@ -66,7 +67,9 @@ def insert_matched_patient( # create a new Person record if one isn't provided person = models.Person() if not person_id else session.get(models.Person, person_id) - patient = models.Patient(person=person, data=data) + patient = models.Patient( + person=person, data=data, external_patient_id=external_patient_id + ) if external_person_id is not None: patient.external_person_id = external_person_id patient.external_person_source = "IRIS" diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py index 829c773d..b7909548 100644 --- a/tests/unit/test_simple_mpi.py +++ b/tests/unit/test_simple_mpi.py @@ -99,6 +99,19 @@ def test_with_person(self, session): assert patient.external_person_source is None assert len(patient.blocking_values) == 3 + def test_with_person_and_external_patient_id(self, session): + person = models.Person() + session.add(person) + session.flush() + data = {"name": [{"given": ["George",], "family": "Harrison"}]} + patient = simple_mpi.insert_matched_patient(session, data, person_id=person.id, external_patient_id="abc") + assert patient.person_id == person.id + assert patient.data == data + assert patient.external_patient_id == "abc" + assert patient.external_person_id is None + assert patient.external_person_source is None + assert len(patient.blocking_values) == 2 + class TestGetBlockData: @pytest.fixture From 406f87bff1e49b88b658bf75779c6851a2360899 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 19 Sep 2024 16:59:39 -0700 Subject: [PATCH 14/19] BlockingKey.to_value should return set for performance --- src/recordlinker/linkage/models.py | 48 +++++++++++++-------------- tests/unit/test_models.py | 52 +++++++++++++++--------------- tests/unit/test_simple_mpi.py | 6 ++++ 3 files changed, 56 insertions(+), 50 deletions(-) diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 64acc336..21007595 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -59,7 +59,7 @@ class BlockingKey(enum.Enum): """ BIRTHDATE = 1, "Date of Birth" MRN = 2, "Last 4 chars of MRN" - SEX = 3, "Gender" + SEX = 3, "Sex" ZIP = 4, "Zip Code" FIRST_NAME = 5, "First 4 chars of First Name" LAST_NAME = 6, "First 4 chars of Last Name" @@ -68,9 +68,9 @@ def __init__(self, id: int, description: str): self.id = id self.description = description - def to_value(self, data: dict) -> list[str]: + def to_value(self, data: dict) -> set[str]: """ - Given a data dictionary of Patient PII data, return a generator of all + Given a data dictionary of Patient PII data, return a set of all possible values for this Key. Many Keys will only have 1 possible value, but some (like first name) could have multiple values. """ @@ -86,54 +86,54 @@ def to_value(self, data: dict) -> list[str]: return self._extract_first_name_first_four(data) if self == BlockingKey.LAST_NAME: return self._extract_last_name_first_four(data) - return [] + return set() - def _extract_birthdate(self, data: dict) -> list[str]: + def _extract_birthdate(self, data: dict) -> set[str]: if "birthdate" in data: val = data["birthdate"] if not isinstance(val, (datetime.date, datetime.datetime)): # if not an instance of date or datetime, try to parse it val = dateutil.parser.parse(str(val)) - return [val.strftime("%Y-%m-%d")] - return [] + return {val.strftime("%Y-%m-%d")} + return set() - def _extract_mrn_last_four(self, data: dict) -> list[str]: + def _extract_mrn_last_four(self, data: dict) -> set[str]: if "mrn" in data: - return [data["mrn"].strip()[-4:]] - return [] + return {data["mrn"].strip()[-4:]} + return set() - def _extract_sex(self, data: dict) -> list[str]: + def _extract_sex(self, data: dict) -> set[str]: if "sex" in data: val = str(data["sex"]).lower().strip() if val in ["m", "male"]: - return ["m"] + return {"m"} elif val in ["f", "female"]: - return ["f"] - return ["u"] - return [] + return {"f"} + return {"u"} + return set() - def _extract_zipcode(self, data: dict) -> list[str]: - zipcodes = [] + def _extract_zipcode(self, data: dict) -> set[str]: + zipcodes = set() for address in data.get("address", []): if isinstance(address, dict): if "zip" in address: - zipcodes.append(str(address["zip"]).strip()[0:5]) + zipcodes.add(str(address["zip"]).strip()[0:5]) return zipcodes - def _extract_first_name_first_four(self, data: dict) -> list[str]: - names = [] + def _extract_first_name_first_four(self, data: dict) -> set[str]: + names = set() for name in data.get("name", []): if isinstance(name, dict): for given in name.get("given", []): - names.append(str(given)[:4]) + names.add(str(given)[:4]) return names - def _extract_last_name_first_four(self, data: dict) -> list[str]: - names = [] + def _extract_last_name_first_four(self, data: dict) -> set[str]: + names = set() for name in data.get("name", []): if isinstance(name, dict): if "family" in name: - names.append(str(name["family"])[:4]) + names.add(str(name["family"])[:4]) return names diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py index c3abb76f..08763180 100644 --- a/tests/unit/test_models.py +++ b/tests/unit/test_models.py @@ -13,62 +13,62 @@ class TestBlockingKey: def test_extract_birthdate(self): data = {"dob": "01/01/1980"} - assert models.BlockingKey.BIRTHDATE.to_value(data) == [] + assert models.BlockingKey.BIRTHDATE.to_value(data) == set() data = {"birthdate": "01/01/1980"} - assert models.BlockingKey.BIRTHDATE.to_value(data) == ["1980-01-01"] + assert models.BlockingKey.BIRTHDATE.to_value(data) == {"1980-01-01"} data = {"birthdate": datetime.date(1980, 1, 1)} - assert models.BlockingKey.BIRTHDATE.to_value(data) == ["1980-01-01"] + assert models.BlockingKey.BIRTHDATE.to_value(data) == {"1980-01-01"} data = {"birthdate": datetime.datetime(1980, 1, 1, 12, 30, 0)} - assert models.BlockingKey.BIRTHDATE.to_value(data) == ["1980-01-01"] + assert models.BlockingKey.BIRTHDATE.to_value(data) == {"1980-01-01"} def test_extract_mrn_last_four(self): data = {"ssn": "123456789"} - assert models.BlockingKey.MRN.to_value(data) == [] + assert models.BlockingKey.MRN.to_value(data) == set() data = {"mrn": "123456789"} - assert models.BlockingKey.MRN.to_value(data) == ["6789"] + assert models.BlockingKey.MRN.to_value(data) == {"6789"} data = {"mrn": "89"} - assert models.BlockingKey.MRN.to_value(data) == ["89"] + assert models.BlockingKey.MRN.to_value(data) == {"89"} def test_extract_sex(self): data = {"gender": "M"} - assert models.BlockingKey.SEX.to_value(data) == [] + assert models.BlockingKey.SEX.to_value(data) == set() data = {"sex": "M"} - assert models.BlockingKey.SEX.to_value(data) == ["m"] + assert models.BlockingKey.SEX.to_value(data) == {"m"} data = {"sex": "Male"} - assert models.BlockingKey.SEX.to_value(data) == ["m"] + assert models.BlockingKey.SEX.to_value(data) == {"m"} data = {"sex": "f"} - assert models.BlockingKey.SEX.to_value(data) == ["f"] + assert models.BlockingKey.SEX.to_value(data) == {"f"} data = {"sex": "FEMALE"} - assert models.BlockingKey.SEX.to_value(data) == ["f"] + assert models.BlockingKey.SEX.to_value(data) == {"f"} data = {"sex": "other"} - assert models.BlockingKey.SEX.to_value(data) == ["u"] + assert models.BlockingKey.SEX.to_value(data) == {"u"} data = {"sex": "unknown"} - assert models.BlockingKey.SEX.to_value(data) == ["u"] + assert models.BlockingKey.SEX.to_value(data) == {"u"} data = {"sex": "?"} - assert models.BlockingKey.SEX.to_value(data) == ["u"] + assert models.BlockingKey.SEX.to_value(data) == {"u"} def test_extract_zipcode(self): data = {"zip": "12345"} - assert models.BlockingKey.ZIP.to_value(data) == [] + assert models.BlockingKey.ZIP.to_value(data) == set() data = {"address": [{"zip": "12345"}]} - assert models.BlockingKey.ZIP.to_value(data) == ["12345"] + assert models.BlockingKey.ZIP.to_value(data) == {"12345"} data = {"address": [{"zip": "12345-6789"}]} - assert models.BlockingKey.ZIP.to_value(data) == ["12345"] + assert models.BlockingKey.ZIP.to_value(data) == {"12345"} data = {"address": [{"zip": "12345-6789"}, {"zip": "54321"}]} - assert models.BlockingKey.ZIP.to_value(data) == ["12345", "54321"] + assert models.BlockingKey.ZIP.to_value(data) == {"12345", "54321"} def test_extract_first_name_first_four(self): data = {"first_name": "John"} - assert models.BlockingKey.FIRST_NAME.to_value(data) == [] + assert models.BlockingKey.FIRST_NAME.to_value(data) == set() data = {"name": [{"given": ["John", "Jane"]}]} - assert models.BlockingKey.FIRST_NAME.to_value(data) == ["John", "Jane"] - data = {"name": [{"given": ["Janet", "Johnathon"]}]} - assert models.BlockingKey.FIRST_NAME.to_value(data) == ["Jane", "John"] + assert models.BlockingKey.FIRST_NAME.to_value(data) == {"John", "Jane"} + data = {"name": [{"given": ["Janet", "Johnathon"]}, {"given": ["Jane"]}]} + assert models.BlockingKey.FIRST_NAME.to_value(data) == {"Jane", "John"} def test_extract_last_name_first_four(self): data = {"last_name": "Doe"} - assert models.BlockingKey.LAST_NAME.to_value(data) == [] + assert models.BlockingKey.LAST_NAME.to_value(data) == set() data = {"name": [{"family": "Doe"}]} - assert models.BlockingKey.LAST_NAME.to_value(data) == ["Doe"] + assert models.BlockingKey.LAST_NAME.to_value(data) == {"Doe"} data = {"name": [{"family": "Smith"}, {"family": "Doe"}]} - assert models.BlockingKey.LAST_NAME.to_value(data) == ["Smit", "Doe"] + assert models.BlockingKey.LAST_NAME.to_value(data) == {"Smit", "Doe"} diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py index b7909548..ff4d3682 100644 --- a/tests/unit/test_simple_mpi.py +++ b/tests/unit/test_simple_mpi.py @@ -159,3 +159,9 @@ def test_match_on_birthdate_first_name_and_last_name(self, session, prime_index) data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Doeherty"}], "birthdate": "Jan 1 1980"} matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 0 + + def test_match_on_multiple_names(self, session, prime_index): + data = {"name": [{"use": "official", "given": ["John", "Doe"], "family": "Smith"}, {"use": "maiden", "given": ["John"], "family": "Doe"}]} + algo_config = {"blocks": [{"value": "first_name"}, {"value": "last_name"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 4 From 7fcc028474ed4a041b1a7f9f483f12b1f0af6d12 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 19 Sep 2024 17:05:53 -0700 Subject: [PATCH 15/19] fix test --- tests/unit/test_simple_mpi.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py index ff4d3682..7272e5af 100644 --- a/tests/unit/test_simple_mpi.py +++ b/tests/unit/test_simple_mpi.py @@ -49,18 +49,16 @@ def test_patient_with_blocking_keys(self, session, new_patient): new_patient.data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} keys = simple_mpi.insert_blocking_keys(session, new_patient) assert len(keys) == 4 - assert keys[0].patient_id == new_patient.id - assert keys[0].blockingkey == models.BlockingKey.BIRTHDATE.id - assert keys[0].value == "1980-01-01" - assert keys[1].patient_id == new_patient.id - assert keys[1].blockingkey == models.BlockingKey.FIRST_NAME.id - assert keys[1].value == "John" - assert keys[2].patient_id == new_patient.id - assert keys[2].blockingkey == models.BlockingKey.FIRST_NAME.id - assert keys[2].value == "Bill" - assert keys[3].patient_id == new_patient.id - assert keys[3].blockingkey == models.BlockingKey.LAST_NAME.id - assert keys[3].value == "Smit" + for key in keys: + assert keys[0].patient_id == new_patient.id + if key.blockingkey == models.BlockingKey.BIRTHDATE.id: + assert key.value == "1980-01-01" + elif key.blockingkey == models.BlockingKey.FIRST_NAME.id: + assert key.value in ["John", "Bill"] + elif key.blockingkey == models.BlockingKey.LAST_NAME.id: + assert key.value == "Smit" + else: + assert False, f"Unexpected blocking key: {key.blockingkey}" class TestInsertMatchedPatient: @@ -119,7 +117,7 @@ def prime_index(self, session): data = [ {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"}, {"name": [{"given": ["George",], "family": "Harrison"}], "birthdate": "1943-2-25"}, - {"name": [{"given": ["John",], "family": "Doe"}], "birthdate": "1980-01-01"}, + {"name": [{"given": ["John",], "family": "Doe"}, {"given": ["John"], "family": "Lewis"}], "birthdate": "1980-01-01"}, {"name": [{"given": ["Bill",], "family": "Smith"}], "birthdate": "1980-01-01"}, {"name": [{"given": ["John",], "family": "Smith"}], "birthdate": "1980-01-01"}, {"name": [{"given": ["John",], "family": "Smith"}], "birthdate": "1985-11-12"}, From cf27dfa35bbd068c51df26191288e0f6643098be Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Thu, 19 Sep 2024 20:00:08 -0700 Subject: [PATCH 16/19] better support for empty PII blocking values --- src/recordlinker/linkage/models.py | 13 +++++++------ tests/unit/test_simple_mpi.py | 23 ++++++++++++++++++----- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index 21007595..c3ea73a6 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -89,7 +89,7 @@ def to_value(self, data: dict) -> set[str]: return set() def _extract_birthdate(self, data: dict) -> set[str]: - if "birthdate" in data: + if data.get("birthdate"): val = data["birthdate"] if not isinstance(val, (datetime.date, datetime.datetime)): # if not an instance of date or datetime, try to parse it @@ -98,12 +98,12 @@ def _extract_birthdate(self, data: dict) -> set[str]: return set() def _extract_mrn_last_four(self, data: dict) -> set[str]: - if "mrn" in data: + if data.get("mrn"): return {data["mrn"].strip()[-4:]} return set() def _extract_sex(self, data: dict) -> set[str]: - if "sex" in data: + if data.get("sex"): val = str(data["sex"]).lower().strip() if val in ["m", "male"]: return {"m"} @@ -116,7 +116,7 @@ def _extract_zipcode(self, data: dict) -> set[str]: zipcodes = set() for address in data.get("address", []): if isinstance(address, dict): - if "zip" in address: + if address.get("zip"): zipcodes.add(str(address["zip"]).strip()[0:5]) return zipcodes @@ -125,14 +125,15 @@ def _extract_first_name_first_four(self, data: dict) -> set[str]: for name in data.get("name", []): if isinstance(name, dict): for given in name.get("given", []): - names.add(str(given)[:4]) + if given: + names.add(str(given)[:4]) return names def _extract_last_name_first_four(self, data: dict) -> set[str]: names = set() for name in data.get("name", []): if isinstance(name, dict): - if "family" in name: + if name.get("family"): names.add(str(name["family"])[:4]) return names diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py index 7272e5af..346792df 100644 --- a/tests/unit/test_simple_mpi.py +++ b/tests/unit/test_simple_mpi.py @@ -121,11 +121,24 @@ def prime_index(self, session): {"name": [{"given": ["Bill",], "family": "Smith"}], "birthdate": "1980-01-01"}, {"name": [{"given": ["John",], "family": "Smith"}], "birthdate": "1980-01-01"}, {"name": [{"given": ["John",], "family": "Smith"}], "birthdate": "1985-11-12"}, + {"name": [{"given": ["Ferris",], "family": "Bueller"}], "birthdate": ""}, ] for datum in data: simple_mpi.insert_matched_patient(session, datum) - def test_match_on_birthdate(self, session, prime_index): + def test_block_missing_data(self, session, prime_index): + data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}]} + algo_config = {"blocks": [{"value": "birthdate"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 0 + + def test_block_empty_block_key(self, session, prime_index): + data = {"name": [{"given": ["Ferris",], "family": "Bueller"}], "birthdate": ""} + algo_config = {"blocks": [{"value": "birthdate"}, {"value": "first_name"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 0 + + def test_block_on_birthdate(self, session, prime_index): data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} algo_config = {"blocks": [{"value": "birthdate"}]} matches = simple_mpi.get_block_data(session, data, algo_config) @@ -134,19 +147,19 @@ def test_match_on_birthdate(self, session, prime_index): matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 1 - def test_match_on_first_name(self, session, prime_index): + def test_block_on_first_name(self, session, prime_index): data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} algo_config = {"blocks": [{"value": "first_name"}]} matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 5 - def test_match_on_birthdate_and_first_name(self, session, prime_index): + def test_block_on_birthdate_and_first_name(self, session, prime_index): data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} algo_config = {"blocks": [{"value": "birthdate"}, {"value": "first_name"}]} matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 4 - def test_match_on_birthdate_first_name_and_last_name(self, session, prime_index): + def test_block_on_birthdate_first_name_and_last_name(self, session, prime_index): data = {"name": [{"given": ["Johnathon", "Bill",], "family": "Smith"}], "birthdate": "01/01/1980"} algo_config = {"blocks": [{"value": "birthdate"}, {"value": "first_name"}, {"value": "last_name"}]} matches = simple_mpi.get_block_data(session, data, algo_config) @@ -158,7 +171,7 @@ def test_match_on_birthdate_first_name_and_last_name(self, session, prime_index) matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 0 - def test_match_on_multiple_names(self, session, prime_index): + def test_block_on_multiple_names(self, session, prime_index): data = {"name": [{"use": "official", "given": ["John", "Doe"], "family": "Smith"}, {"use": "maiden", "given": ["John"], "family": "Doe"}]} algo_config = {"blocks": [{"value": "first_name"}, {"value": "last_name"}]} matches = simple_mpi.get_block_data(session, data, algo_config) From 9a5d7ecc6788c0ede8a3673346842c411732a444 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Fri, 20 Sep 2024 08:53:26 -0700 Subject: [PATCH 17/19] BlockingKey function cleanup to make it more readable --- src/recordlinker/linkage/models.py | 114 ++++++++++++++++------------- tests/unit/test_models.py | 12 +++ 2 files changed, 76 insertions(+), 50 deletions(-) diff --git a/src/recordlinker/linkage/models.py b/src/recordlinker/linkage/models.py index c3ea73a6..a7627a6f 100644 --- a/src/recordlinker/linkage/models.py +++ b/src/recordlinker/linkage/models.py @@ -1,5 +1,6 @@ import datetime import enum +import typing import uuid import dateutil.parser @@ -39,16 +40,24 @@ class Patient(Base): person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey("mpi_person.id")) person: orm.Mapped["Person"] = orm.relationship(back_populates="patients") data: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON) - external_patient_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), nullable=True) - external_person_id: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), nullable=True) - external_person_source: orm.Mapped[str] = orm.mapped_column(sqltypes.String(100), nullable=True) - blocking_values: orm.Mapped[list["BlockingValue"]] = orm.relationship(back_populates="patient") + external_patient_id: orm.Mapped[str] = orm.mapped_column( + sqltypes.String(255), nullable=True + ) + external_person_id: orm.Mapped[str] = orm.mapped_column( + sqltypes.String(255), nullable=True + ) + external_person_source: orm.Mapped[str] = orm.mapped_column( + sqltypes.String(100), nullable=True + ) + blocking_values: orm.Mapped[list["BlockingValue"]] = orm.relationship( + back_populates="patient" + ) class BlockingKey(enum.Enum): """ Enum for the different types of blocking keys that can be used for patient - matching. This is the universe of all possible blocking keys that a user can + matching. This is the universe of all possible blocking keys that a user can choose from when configuring their algorithm. When data is loaded into the MPI, all possible BlockingValues will be created for the defined BlockingKeys. However, only a subset will be used in matching, based on the configuration of @@ -57,6 +66,7 @@ class BlockingKey(enum.Enum): **HERE BE DRAGONS**: IN A PRODUCTION SYSTEM, THESE ENUMS SHOULD NOT BE CHANGED!!! """ + BIRTHDATE = 1, "Date of Birth" MRN = 2, "Last 4 chars of MRN" SEX = 3, "Sex" @@ -74,68 +84,72 @@ def to_value(self, data: dict) -> set[str]: possible values for this Key. Many Keys will only have 1 possible value, but some (like first name) could have multiple values. """ + vals: set[str] = set() if self == BlockingKey.BIRTHDATE: - return self._extract_birthdate(data) + vals.update(self._extract_birthdate(data)) if self == BlockingKey.MRN: - return self._extract_mrn_last_four(data) + vals.update(self._extract_mrn_last4(data)) if self == BlockingKey.SEX: - return self._extract_sex(data) + vals.update(self._extract_sex(data)) if self == BlockingKey.ZIP: - return self._extract_zipcode(data) + vals.update(self._extract_zipcode(data)) if self == BlockingKey.FIRST_NAME: - return self._extract_first_name_first_four(data) + vals.update(self._extract_first_name_first4(data)) if self == BlockingKey.LAST_NAME: - return self._extract_last_name_first_four(data) - return set() + vals.update(self._extract_last_name_first4(data)) + + # remove all empty strings from the set, we never want to block on these + vals.discard("") + return vals - def _extract_birthdate(self, data: dict) -> set[str]: - if data.get("birthdate"): - val = data["birthdate"] + # FIXME: some of the type/isinstance checks below can be removed after we create + # a serializer to capture the input data, as type dict is too flexible + def _extract_birthdate(self, data: dict) -> typing.Iterator[str]: + val = data.get("birthdate") + if val: if not isinstance(val, (datetime.date, datetime.datetime)): # if not an instance of date or datetime, try to parse it val = dateutil.parser.parse(str(val)) - return {val.strftime("%Y-%m-%d")} - return set() + yield val.strftime("%Y-%m-%d") - def _extract_mrn_last_four(self, data: dict) -> set[str]: + def _extract_mrn_last4(self, data: dict) -> typing.Iterator[str]: if data.get("mrn"): - return {data["mrn"].strip()[-4:]} - return set() + yield data["mrn"].strip()[-4:] - def _extract_sex(self, data: dict) -> set[str]: + def _extract_sex(self, data: dict) -> typing.Iterator[str]: if data.get("sex"): val = str(data["sex"]).lower().strip() if val in ["m", "male"]: - return {"m"} + yield "m" elif val in ["f", "female"]: - return {"f"} - return {"u"} - return set() - - def _extract_zipcode(self, data: dict) -> set[str]: - zipcodes = set() - for address in data.get("address", []): - if isinstance(address, dict): - if address.get("zip"): - zipcodes.add(str(address["zip"]).strip()[0:5]) - return zipcodes - - def _extract_first_name_first_four(self, data: dict) -> set[str]: - names = set() - for name in data.get("name", []): - if isinstance(name, dict): - for given in name.get("given", []): - if given: - names.add(str(given)[:4]) - return names - - def _extract_last_name_first_four(self, data: dict) -> set[str]: - names = set() - for name in data.get("name", []): - if isinstance(name, dict): - if name.get("family"): - names.add(str(name["family"])[:4]) - return names + yield "f" + else: + yield "u" + + def _extract_zipcode(self, data: dict) -> typing.Iterator[str]: + val = data.get("address") + if isinstance(val, (list, set)): + for address in val: + if isinstance(address, dict): + if address.get("zip"): + yield str(address["zip"]).strip()[0:5] + + def _extract_first_name_first4(self, data: dict) -> typing.Iterator[str]: + val = data.get("name") + if isinstance(val, (list, set)): + for name in val: + if isinstance(name, dict): + for given in name.get("given", []): + if given: + yield str(given)[:4] + + def _extract_last_name_first4(self, data: dict) -> typing.Iterator[str]: + val = data.get("name") + if isinstance(val, (list, set)): + for name in val: + if isinstance(name, dict): + if name.get("family"): + yield str(name["family"])[:4] class BlockingValue(Base): diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py index 08763180..20510239 100644 --- a/tests/unit/test_models.py +++ b/tests/unit/test_models.py @@ -14,6 +14,8 @@ class TestBlockingKey: def test_extract_birthdate(self): data = {"dob": "01/01/1980"} assert models.BlockingKey.BIRTHDATE.to_value(data) == set() + data = {"birthdate": ""} + assert models.BlockingKey.BIRTHDATE.to_value(data) == set() data = {"birthdate": "01/01/1980"} assert models.BlockingKey.BIRTHDATE.to_value(data) == {"1980-01-01"} data = {"birthdate": datetime.date(1980, 1, 1)} @@ -24,6 +26,8 @@ def test_extract_birthdate(self): def test_extract_mrn_last_four(self): data = {"ssn": "123456789"} assert models.BlockingKey.MRN.to_value(data) == set() + data = {"mrn": None} + assert models.BlockingKey.MRN.to_value(data) == set() data = {"mrn": "123456789"} assert models.BlockingKey.MRN.to_value(data) == {"6789"} data = {"mrn": "89"} @@ -32,6 +36,8 @@ def test_extract_mrn_last_four(self): def test_extract_sex(self): data = {"gender": "M"} assert models.BlockingKey.SEX.to_value(data) == set() + data = {"sex": ""} + assert models.BlockingKey.SEX.to_value(data) == set() data = {"sex": "M"} assert models.BlockingKey.SEX.to_value(data) == {"m"} data = {"sex": "Male"} @@ -50,6 +56,8 @@ def test_extract_sex(self): def test_extract_zipcode(self): data = {"zip": "12345"} assert models.BlockingKey.ZIP.to_value(data) == set() + data = {"address": [{"zip": None}]} + assert models.BlockingKey.ZIP.to_value(data) == set() data = {"address": [{"zip": "12345"}]} assert models.BlockingKey.ZIP.to_value(data) == {"12345"} data = {"address": [{"zip": "12345-6789"}]} @@ -60,6 +68,8 @@ def test_extract_zipcode(self): def test_extract_first_name_first_four(self): data = {"first_name": "John"} assert models.BlockingKey.FIRST_NAME.to_value(data) == set() + data = {"name": [{"given": ["", None]}]} + assert models.BlockingKey.FIRST_NAME.to_value(data) == set() data = {"name": [{"given": ["John", "Jane"]}]} assert models.BlockingKey.FIRST_NAME.to_value(data) == {"John", "Jane"} data = {"name": [{"given": ["Janet", "Johnathon"]}, {"given": ["Jane"]}]} @@ -68,6 +78,8 @@ def test_extract_first_name_first_four(self): def test_extract_last_name_first_four(self): data = {"last_name": "Doe"} assert models.BlockingKey.LAST_NAME.to_value(data) == set() + data = {"name": [{"family": ""}]} + assert models.BlockingKey.LAST_NAME.to_value(data) == set() data = {"name": [{"family": "Doe"}]} assert models.BlockingKey.LAST_NAME.to_value(data) == {"Doe"} data = {"name": [{"family": "Smith"}, {"family": "Doe"}]} From e2d4bbf9c6d3f22f1e4415246f13499c275a30dd Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Fri, 20 Sep 2024 14:29:12 -0700 Subject: [PATCH 18/19] small change to PR template --- .github/pull_request_template.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 28ad7199..1284a46e 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -7,6 +7,8 @@ ## Additional Notes [Add any additional context or notes that reviewers should know about.] +<--------------------- REMOVE THE LINES BELOW BEFORE MERGING ---------------------> + ## Checklist Please review and complete the following checklist before submitting your pull request: From 0b123ae1c83732a44bd4f50e85fc18d3189525b1 Mon Sep 17 00:00:00 2001 From: Eric Buckley Date: Tue, 24 Sep 2024 10:53:45 -0700 Subject: [PATCH 19/19] adding optimization to shortcirtuit get_block_data early if blocking key value is missing --- src/recordlinker/linkage/simple_mpi.py | 4 ++++ tests/unit/test_simple_mpi.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/recordlinker/linkage/simple_mpi.py b/src/recordlinker/linkage/simple_mpi.py index 0217e42e..2b3ee240 100644 --- a/src/recordlinker/linkage/simple_mpi.py +++ b/src/recordlinker/linkage/simple_mpi.py @@ -34,6 +34,10 @@ def get_block_data( key = models.BlockingKey[key_name] # Get all the possible values from the data for this key vals = [v for v in key.to_value(data)] + # If there are no values for a blocking key in the pass, we can skip + # the query and return an empty list, this is just an optimization + if not vals: + return [] # Create a dynamic alias for the Blocking Value table using the index # this is necessary since we are potentially joining the same table # multiple times with different conditions diff --git a/tests/unit/test_simple_mpi.py b/tests/unit/test_simple_mpi.py index 346792df..0f0868ba 100644 --- a/tests/unit/test_simple_mpi.py +++ b/tests/unit/test_simple_mpi.py @@ -176,3 +176,9 @@ def test_block_on_multiple_names(self, session, prime_index): algo_config = {"blocks": [{"value": "first_name"}, {"value": "last_name"}]} matches = simple_mpi.get_block_data(session, data, algo_config) assert len(matches) == 4 + + def test_block_missing_keys(self, session, prime_index): + data = {"birthdate": "01/01/1980"} + algo_config = {"blocks": [{"value": "birthdate"}, {"value": "last_name"}]} + matches = simple_mpi.get_block_data(session, data, algo_config) + assert len(matches) == 0