Skip to content

Commit a2798c1

Browse files
Add endpoint to return orphaned persons (#225)
## Description The PR adds an endpoint to return a paginated list of all the persons with no members. While working on this PR I also made some small changes to `get_orphaned_patients`: - added an explicit `ORDER BY` after discovering some inconsistent results when I included a `limit` but not a `cursor`. I also updated the tests accordingly. - made `schemas.PaginatedRefs` generic so that the paginated results for both orphaned patients and orphaned persons have the same shape. ## Related Issues #163 ## Additional Notes I spent a good amount of time thinking about how we should execute this query to make it as efficient as possible. My working assumptions were that: 1) the patient table will be larger than the person table, 2) both tables will be large, and 3) orphaned persons will be relatively rare. I considered two approaches, "LEFT JOIN" (which I ultimately landed on) and "NOT EXISTS", but I am open to hearing others. LEFT JOIN approach: ``` SELECT p.* FROM mpi_person p LEFT JOIN mpi_patient pt ON pt.person_id = p.id WHERE pt.id IS NULL ``` From `EXPLAIN QUERY PLAN`, we can see that we are scanning `mpi_person`, using a Bloom filter when scanning the larger patient table to more quickly eliminate rows that definitely do not match any rows in Person, and using a covering index on `mpi_patient.id`, which should keep things as quick as possible. NOT EXISTS approach: ``` SELECT * FROM mpi_person p WHERE NOT EXISTS ( SELECT 1 FROM mpi_patient pt WHERE pt.person_id = p.id ) ``` This approach is less efficient because the subquery executes multiple times, checking for matching rows in Patient for each row in Person. It would be more efficient if we added an index on Patient.person_id, but I still think LEFT JOIN is a better choice given the assumptions stated above (especially #1). I also briefly considered a view, but given the number of updates to the Patient and Person tables, I don't think this is our best path forward.
All that said, I am open to other approaches and would love to get folks' thoughts. --------- Co-authored-by: Eric Buckley <[email protected]>
1 parent 1027b73 commit a2798c1

File tree

8 files changed

+298
-27
lines changed

8 files changed

+298
-27
lines changed

src/recordlinker/database/mpi_service.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,10 +386,41 @@ def get_orphaned_patients(
386386
"""
387387
Retrieve orphaned Patients in the MPI database, up to the provided limit.
388388
"""
389-
query = select(models.Patient).where(models.Patient.person_id.is_(None)).limit(limit)
389+
query = (
390+
select(models.Patient)
391+
.where(models.Patient.person_id.is_(None))
392+
.order_by(models.Patient.id)
393+
.limit(limit)
394+
)
390395

391396
# Apply cursor if provided
392397
if cursor:
393398
query = query.where(models.Patient.id > cursor)
394399

395400
return session.execute(query).scalars().all()
401+
402+
403+
def get_orphaned_persons(
404+
session: orm.Session,
405+
limit: int | None = 50,
406+
cursor: int | None = None,
407+
) -> typing.Sequence[models.Person]:
408+
"""
409+
Retrieve orphaned Persons in the MPI database, up to the provided limit. If a
410+
cursor (in the form of a Person id) is provided, only retrieve Persons
411+
with an id greater than the cursor.
412+
"""
413+
query = (
414+
select(models.Person)
415+
.outerjoin(models.Patient, models.Patient.person_id == models.Person.id)
416+
.filter(models.Patient.id.is_(None))
417+
.order_by(models.Person.id)
418+
)
419+
if cursor:
420+
query = query.filter(models.Person.id > cursor)
421+
422+
query = query.limit(
423+
limit
424+
) # limit applied after cursor to ensure the limit is applied after the JOIN and starts from the cursor after the join
425+
426+
return session.execute(query).scalars().all()

src/recordlinker/routes/patient_router.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def get_orphaned_patients(
6666
session: orm.Session = fastapi.Depends(get_session),
6767
limit: int | None = fastapi.Query(50, alias="limit", ge=1, le=1000),
6868
cursor: uuid.UUID | None = fastapi.Query(None, alias="cursor"),
69-
) -> schemas.PaginatedPatientRefs:
69+
) -> schemas.PaginatedRefs:
7070
"""
7171
Retrieve patient_reference_id(s) for all Patients that are not linked to a Person.
7272
"""
@@ -91,8 +91,8 @@ def get_orphaned_patients(
9191

9292
patients = service.get_orphaned_patients(session, limit, cur)
9393
if not patients:
94-
return schemas.PaginatedPatientRefs(
95-
patients=[], meta=schemas.PaginatedMetaData(next_cursor=None, next=None)
94+
return schemas.PaginatedRefs(
95+
data=[], meta=schemas.PaginatedMetaData(next_cursor=None, next=None)
9696
)
9797
# Prepare the meta data
9898
next_cursor = patients[-1].reference_id if len(patients) == limit else None
@@ -102,8 +102,8 @@ def get_orphaned_patients(
102102
else None
103103
)
104104

105-
return schemas.PaginatedPatientRefs(
106-
patients=[p.reference_id for p in patients if p.reference_id],
105+
return schemas.PaginatedRefs(
106+
data=[p.reference_id for p in patients if p.reference_id],
107107
meta=schemas.PaginatedMetaData(
108108
next_cursor=next_cursor,
109109
next=pydantic.HttpUrl(next_url) if next_url else None,

src/recordlinker/routes/person_router.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import uuid
1111

1212
import fastapi
13+
import pydantic
1314
import sqlalchemy.orm as orm
1415

1516
from recordlinker import models
@@ -102,6 +103,60 @@ def update_person(
102103
return schemas.PersonRef(person_reference_id=person.reference_id)
103104

104105

106+
@router.get(
107+
"/orphaned", summary="Retrieve orphaned persons", status_code=fastapi.status.HTTP_200_OK
108+
)
109+
def get_orphaned_persons(
110+
request: fastapi.Request,
111+
session: orm.Session = fastapi.Depends(get_session),
112+
limit: int | None = fastapi.Query(50, alias="limit", ge=1, le=1000),
113+
cursor: uuid.UUID | None = fastapi.Query(None, alias="cursor"),
114+
) -> schemas.PaginatedRefs:
115+
"""
116+
Retrieve person_reference_id(s) for all Persons that are not linked to any Patients.
117+
"""
118+
# Check if the cursor is a valid Person reference_id
119+
if cursor:
120+
person = service.get_persons_by_reference_ids(session, cursor)
121+
if not person or person[0] is None:
122+
raise fastapi.HTTPException(
123+
status_code=fastapi.status.HTTP_422_UNPROCESSABLE_ENTITY,
124+
detail=[
125+
{
126+
"loc": ["query", "cursor"],
127+
"msg": "Cursor is an invalid Person reference_id",
128+
"type": "value_error",
129+
}
130+
],
131+
)
132+
# Replace the cursor with the Person id instead of reference_id
133+
cur = person[0].id
134+
else:
135+
cur = None
136+
137+
persons = service.get_orphaned_persons(session, limit, cur)
138+
if not persons:
139+
return schemas.PaginatedRefs(
140+
data=[], meta=schemas.PaginatedMetaData(next_cursor=None, next=None)
141+
)
142+
143+
# Prepare the meta data
144+
next_cursor = persons[-1].reference_id if len(persons) == limit else None
145+
next_url = (
146+
f"{request.base_url}person/orphaned?limit={limit}&cursor={next_cursor}"
147+
if next_cursor
148+
else None
149+
)
150+
151+
return schemas.PaginatedRefs(
152+
data=[p.reference_id for p in persons if p.reference_id],
153+
meta=schemas.PaginatedMetaData(
154+
next_cursor=next_cursor,
155+
next=pydantic.HttpUrl(next_url) if next_url else None,
156+
),
157+
)
158+
159+
105160
@router.get(
106161
"/{person_reference_id}",
107162
summary="Retrieve a person cluster",

src/recordlinker/schemas/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from .mpi import ErrorDetail
1313
from .mpi import ErrorResponse
1414
from .mpi import PaginatedMetaData
15-
from .mpi import PaginatedPatientRefs
15+
from .mpi import PaginatedRefs
1616
from .mpi import PatientCreatePayload
1717
from .mpi import PatientInfo
1818
from .mpi import PatientPersonRef
@@ -61,5 +61,5 @@
6161
"ErrorDetail",
6262
"ErrorResponse",
6363
"PaginatedMetaData",
64-
"PaginatedPatientRefs",
64+
"PaginatedRefs",
6565
]

src/recordlinker/schemas/mpi.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,6 @@ class PaginatedMetaData(pydantic.BaseModel):
8585
next: pydantic.HttpUrl | None = None
8686

8787

88-
class PaginatedPatientRefs(pydantic.BaseModel):
89-
patients: list[uuid.UUID] = pydantic.Field(...)
88+
class PaginatedRefs(pydantic.BaseModel):
89+
data: list[uuid.UUID] = pydantic.Field(...)
9090
meta: PaginatedMetaData | None

tests/unit/database/test_mpi_service.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -925,8 +925,6 @@ def test_get_orphaned_patients_limit(self, session: Session):
925925
assert len(mpi_service.get_orphaned_patients(session, limit=3)) == 2
926926

927927
def test_get_orphaned_patients_cursor(self, session: Session):
928-
# ordered_uuids.sort()
929-
930928
patient1 = models.Patient(person=None, data={"id": 1})
931929
patient2 = models.Patient(person=None, data={"id": 2})
932930
patient3 = models.Patient(person=None, data={"id": 3})
@@ -949,3 +947,56 @@ def test_get_orphaned_patients_cursor(self, session: Session):
949947
patient2,
950948
patient3,
951949
]
950+
951+
952+
class TestGetOrphanedPersons:
953+
def test_get_orphaned_persons_success(self, session: Session):
954+
person1 = models.Person()
955+
person2 = models.Person()
956+
patient1 = models.Patient(person=person1, data={})
957+
session.add_all([patient1, person2])
958+
session.flush()
959+
assert session.query(models.Patient).count() == 1
960+
assert session.query(models.Person).count() == 2
961+
assert mpi_service.get_orphaned_persons(session) == [person2]
962+
963+
def test_get_orphaned_persons_no_persons(self, session: Session):
964+
patient = models.Patient(person=models.Person(), data={})
965+
session.add(patient)
966+
session.flush()
967+
assert mpi_service.get_orphaned_persons(session) == []
968+
969+
def test_get_orphaned_persons_limit(self, session: Session):
970+
# Checks that limit is correctly applied
971+
person1 = models.Person()
972+
person2 = models.Person()
973+
person3 = models.Person()
974+
patient = models.Patient(person=person1, data={})
975+
session.add_all([patient, person2, person3])
976+
session.flush()
977+
978+
assert len(mpi_service.get_orphaned_persons(session, limit=1)) == 1
979+
assert len(mpi_service.get_orphaned_persons(session, limit=2)) == 2
980+
assert len(mpi_service.get_orphaned_persons(session, limit=3)) == 2
981+
982+
def test_get_orphaned_persons_cursor(self, session: Session):
983+
# Checks that cursor is correctly applied
984+
person1 = models.Person(id=1)
985+
person2 = models.Person(id=2)
986+
person3 = models.Person(id=3)
987+
person4 = models.Person(id=4)
988+
patient = models.Patient(person=person4, data={})
989+
session.add_all([patient, person1, person2, person3])
990+
session.flush()
991+
992+
assert mpi_service.get_orphaned_persons(session, limit=1, cursor=person1.id) == [person2]
993+
assert mpi_service.get_orphaned_persons(session, limit=1, cursor=person2.id) == [person3]
994+
assert mpi_service.get_orphaned_persons(session, limit=2, cursor=person2.id) == [person3]
995+
assert mpi_service.get_orphaned_persons(session, limit=2, cursor=person1.id) == [
996+
person2,
997+
person3,
998+
]
999+
assert mpi_service.get_orphaned_persons(session, limit=5, cursor=person1.id) == [
1000+
person2,
1001+
person3,
1002+
]

tests/unit/routes/test_patient_router.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -182,25 +182,47 @@ def test_get_orphaned_patients(self, client):
182182
response = client.get("/patient/orphaned")
183183
assert response.status_code == 200
184184
assert response.json() == {
185-
"patients": [str(patient1.reference_id)],
185+
"data": [str(patient1.reference_id)],
186186
"meta": {"next_cursor": None, "next": None},
187187
}
188188

189189
def test_no_orphaned_patients(self, client):
190190
response = client.get("/patient/orphaned")
191191
assert response.status_code == 200
192192
assert response.json() == {
193-
"patients": [],
193+
"data": [],
194194
"meta": {"next_cursor": None, "next": None},
195195
}
196196

197-
def test_get_orphaned_patients_with_cursor(self, client):
198-
ordered_uuids = [uuid.uuid4() for _ in range(3)]
199-
ordered_uuids.sort()
197+
def test_get_orphaned_patients_with_limit(self, client):
198+
patient1 = models.Patient(person=None, data={"id": 1})
199+
patient2 = models.Patient(person=None, data={"id": 2})
200+
client.session.add_all([patient1, patient2])
201+
client.session.flush()
202+
203+
response = client.get("/patient/orphaned?limit=1")
204+
assert response.status_code == 200
205+
assert response.json() == {
206+
"data": [str(patient1.reference_id)],
207+
"meta": {
208+
"next_cursor": str(patient1.reference_id),
209+
"next": f"http://testserver/patient/orphaned?limit=1&cursor={str(patient1.reference_id)}",
210+
},
211+
}
200212

201-
patient1 = models.Patient(person=None, reference_id=ordered_uuids[0])
202-
patient2 = models.Patient(person=None, reference_id=ordered_uuids[1])
203-
patient3 = models.Patient(person=None, reference_id=ordered_uuids[2])
213+
response = client.get("/patient/orphaned?limit=2")
214+
assert response.json() == {
215+
"data": [str(patient1.reference_id), str(patient2.reference_id)],
216+
"meta": {
217+
"next_cursor": str(patient2.reference_id),
218+
"next": f"http://testserver/patient/orphaned?limit=2&cursor={str(patient2.reference_id)}",
219+
},
220+
}
221+
222+
def test_get_orphaned_patients_with_cursor(self, client):
223+
patient1 = models.Patient(person=None, data={"id": 1})
224+
patient2 = models.Patient(person=None, data={"id": 2})
225+
patient3 = models.Patient(person=None, data={"id": 3})
204226
client.session.add_all([patient1, patient2, patient3])
205227
client.session.flush()
206228

@@ -209,27 +231,27 @@ def test_get_orphaned_patients_with_cursor(self, client):
209231
assert response.status_code == 200
210232

211233
assert response.json() == {
212-
"patients": [str(patient2.reference_id)],
234+
"data": [str(patient2.reference_id)],
213235
"meta": {
214-
"next_cursor": str(ordered_uuids[1]),
215-
"next": f"http://testserver/patient/orphaned?limit=1&cursor={str(ordered_uuids[1])}",
236+
"next_cursor": str(patient2.reference_id),
237+
"next": f"http://testserver/patient/orphaned?limit=1&cursor={str(patient2.reference_id)}",
216238
},
217239
}
218240

219241
# Retrieve 2 patients after patient1, return cursor for patient3
220242
response = client.get(f"/patient/orphaned?limit=2&cursor={patient1.reference_id}")
221243
assert response.json() == {
222-
"patients": [str(patient2.reference_id), str(patient3.reference_id)],
244+
"data": [str(patient2.reference_id), str(patient3.reference_id)],
223245
"meta": {
224-
"next_cursor": str(ordered_uuids[2]),
225-
"next": f"http://testserver/patient/orphaned?limit=2&cursor={ordered_uuids[2]}",
246+
"next_cursor": str(patient3.reference_id),
247+
"next": f"http://testserver/patient/orphaned?limit=2&cursor={str(patient3.reference_id)}",
226248
},
227249
}
228250

229251
# Retrieve the 2 orphaned patients after patient1, return no cursor
230252
response = client.get(f"/patient/orphaned?limit=5&cursor={patient1.reference_id}")
231253
assert response.json() == {
232-
"patients": [
254+
"data": [
233255
str(patient2.reference_id),
234256
str(patient3.reference_id),
235257
],

0 commit comments

Comments
 (0)