Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cohort templates #134

Draft
wants to merge 6 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion R/CohortConstruction.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,25 @@

.checkCohortTables(connection, cohortDatabaseSchema, cohortTableNames)

generatedTemplateCohorts <- c()
if ("isTemplatedCohort" %in% colnames(cohortDefinitionSet)) {
cohortDefinitionSet <- cohortDefinitionSet |> dplyr::filter(!.data$isTemplatedCohort)
generatedTemplateCohorts <- generateTemplateCohorts(connection = connection,
cohortDefinitionSet = cohortDefinitionSet,
cdmDatabaseSchema = cdmDatabaseSchema,
tempEmulationSchema = tempEmulationSchema,
cohortDatabaseSchema = cohortDatabaseSchema,
cohortTableNames = cohortTableNames,
stopOnError = stopOnError,
incremental = incremental,
incrementalFolder = incrementalFolder)

if (nrow(cohortDefinitionSet) == 0) {
return(invisible(generatedTemplateCohorts))
}
}


if (incremental) {
recordKeepingFile <- file.path(incrementalFolder, "GeneratedCohorts.csv")

Expand Down Expand Up @@ -186,7 +205,7 @@
}

# Convert the list to a data frame
cohortsGenerated <- do.call(rbind, c(cohortsGenerated, subsetsGenerated))
cohortsGenerated <- do.call(rbind, c(cohortsGenerated, subsetsGenerated, generatedTemplateCohorts))

delta <- Sys.time() - start
writeLines(paste("Generating cohort set took", round(delta, 2), attr(delta, "units")))
Expand Down Expand Up @@ -349,3 +368,42 @@
)
return(summary)
}


generateTemplateCohorts <- function(connection,
cohortDefinitionSet,
cdmDatabaseSchema,
tempEmulationSchema,
cohortDatabaseSchema,
cohortTableNames,
stopOnError,
incremental,
incrementalFolder) {

templateDefs <- .getTemplateDefinitions(cohortDefinitionSet)
statusTbl <- data.frame()
for (tpl in templateDefs) {
status <- tryCatch({
ParallelLogger::logInfo("GENERATING TEMPLATE COHORT: ", tpl$getName())
status <- tpl$executeTemplateSql(connection = connection,
cohortDatabaseSchema = cohortDatabaseSchema,
tempEmulationSchema = tempEmulationSchema,
cdmDatabaseSchema = cdmDatabaseSchema,
cohortTableNames = cohortTableNames,
incremental = incremental,
incrementalFolder = incrementalFolder)
}, error = function(err) {
if (stopOnError)
stop(err)

Check warning on line 397 in R/CohortConstruction.R

View check run for this annotation

Codecov / codecov/patch

R/CohortConstruction.R#L396-L397

Added lines #L396 - L397 were not covered by tests

ParallelLogger::logError(error)
return(list(startTime = NA, endTime = NA, generationStatus = "FAILED"))

Check warning on line 400 in R/CohortConstruction.R

View check run for this annotation

Codecov / codecov/patch

R/CohortConstruction.R#L399-L400

Added lines #L399 - L400 were not covered by tests
})
refs <- tpl$getTemplateReferences(connection = connection)
statusTbl <- statusTbl |> dplyr::bind_rows(data.frame(cohortId = refs$cohortId,
cohortName = refs$cohortName,
generationStatus = status$generationStatus,
startTime = status$startTime,
endTime = status$endTime))
}
}
7 changes: 7 additions & 0 deletions R/CohortDefinitionSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,13 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
checkmate::assert_true(length(cohortFileNameValue) > 0)
assertSettingsColumns(names(cohortDefinitionSet))
checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))

if (length(.getTemplateDefinitions(cohortDefinitionSet)) > 0) {
warning("Saving template cohort definitions is not currently supported")
if (all(cohortDefinitionSet$isTemplatedCohort))
stop("Cohort contains only template cohorts, cannot be saved")
}

settingsFolder <- dirname(settingsFileName)
if (!dir.exists(settingsFolder)) {
dir.create(settingsFolder, recursive = TRUE)
Expand Down
13 changes: 9 additions & 4 deletions R/SubsetDefinitions.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ CohortSubsetDefinition <- R6::R6Class(
#' Returns vector of join, logic, having statements returned by subset operations
#' @param targetOutputPair Target output pair
getSubsetQuery = function(targetOutputPair) {
checkmate::assertIntegerish(targetOutputPair, len = 2)
checkmate::assertNumeric(targetOutputPair, len = 2)
checkmate::assertTRUE(all(targetOutputPair %% 1 == 0))
checkmate::assertFALSE(targetOutputPair[[1]] == targetOutputPair[[2]])

targetTable <- "#cohort_sub_base"
Expand Down Expand Up @@ -133,7 +134,8 @@ CohortSubsetDefinition <- R6::R6Class(
#' @param cohortDefinitionSet Cohort definition set containing base names
#' @param targetOutputPair Target output pair
getSubsetCohortName = function(cohortDefinitionSet, targetOutputPair) {
checkmate::assertIntegerish(targetOutputPair, len = 2)
checkmate::assertNumeric(targetOutputPair, len = 2)
checkmate::assertTRUE(all(targetOutputPair %% 1 == 0))
checkmate::assertFALSE(targetOutputPair[[1]] == targetOutputPair[[2]])
checkmate::assertTRUE(targetOutputPair[[1]] %in% cohortDefinitionSet$cohortId)
checkmate::assertTRUE(isCohortDefinitionSet(cohortDefinitionSet))
Expand All @@ -156,7 +158,9 @@ CohortSubsetDefinition <- R6::R6Class(
#' Set the targetOutputPairs to be added to a cohort definition set
#' @param targetIds list of cohort ids to apply subsetting operations to
setTargetOutputPairs = function(targetIds) {
checkmate::assertIntegerish(targetIds, min.len = 1, upper = 10e11)
checkmate::assertNumeric(targetIds, min.len = 1)
checkmate::assertTRUE(all(targetIds %% 1 == 0))

definitionId <- self$definitionId
targetOutputPairs <- list()

Expand Down Expand Up @@ -190,7 +194,8 @@ CohortSubsetDefinition <- R6::R6Class(
targetOutputPairs,
function(targetOutputPair) {
targetOutputPair <- as.numeric(targetOutputPair)
checkmate::assertIntegerish(targetOutputPair, len = 2, upper = 10e11)
checkmate::assertNumeric(targetOutputPair, len = 2)
checkmate::assertTRUE(all(targetOutputPair %% 1 == 0))
checkmate::assertFALSE(targetOutputPair[[1]] == targetOutputPair[[2]])
targetOutputPair
}
Expand Down
239 changes: 239 additions & 0 deletions R/TemplateCohorts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Class for automating the creation of bulk cohorts
#'
#' This class provides a framework for automating the creation of bulk cohorts
#' by defining template SQL queries and associated callbacks to execute them.
#' This is useful when defining lots of exposure or outcomes for cohorts that are very general in nature.
#' For example, all rxNorm ingredient cohorts, all ATC ingredient cohorts or all SNOMED condition occurences with > x
#' diagnosis codes.
#'
#' These cohorts can then be subsetted with common cohort subset operations such as limiting to specific age, gender,
#' or observation criteria, should this be excluded from the cohort definition. However, when applying operations in
#' bulk it may be more efficient to include such definitions within the template sql itself.
#' This approach is also useful for cohorts that are not based on ATLAS/CirceDefinitions
#'
#'
#' @section Public Functions:
#' \describe{
#' \item{\code{initialize}}{Initializes the CohortTemplate object with the specified
#' \code{templateRefFun} and \code{executeFun} functions, along with their
#' respective arguments.}
#' \item{\code{executeTemplateSql}}{Executes the SQL queries defined in the
#' \code{executeFun} function.}
#' \item{\code{getTemplateReferences}}{Executes the \code{templateRefFun} function
#' and retrieves template references, ensuring they are returned as data frames.}
#' }
#'
CohortTemplateDefinition <- R6::R6Class(
"CohortTemplateDefinition",
private = list(
.id = NULL,
.checksum = NULL,
.name = NULL,
generateId = function(...) {
private$.checksum <- digest::digest(list(...))
private$.id <- paste0("CohortTemplate_", private$.checksum)
}
),
public = list(
executeArgs = NULL,
templateRefArgs = NULL,
requireConnectionRefs = NULL,
templateRefFun = NULL,
executeFun = NULL,
initialize = function(name, templateRefFun, executeFun, templateRefArgs = list(), executeArgs = list(), requireConnectionRefs = FALSE) {
# Check if templateRefFun and executeFun are functions
checkmate::assertFunction(templateRefFun)
checkmate::assertFunction(executeFun)
checkmate::assertList(templateRefArgs)
checkmate::assertList(executeArgs)

checkmate::assertTRUE("connection" %in% names(formals(executeFun)))

if (requireConnectionRefs) {
checkmate::assertTRUE("connection" %in% names(formals(templateRefFun)))
}

private$.name <- name
private$generateId(templateRefFun, executeFun, templateRefArgs, executeArgs)
self$templateRefFun <- templateRefFun
self$executeFun <- executeFun
self$templateRefArgs <- templateRefArgs
self$executeArgs <- executeArgs
self$requireConnectionRefs <- requireConnectionRefs
},

executeTemplateSql = function(connection,
cohortDatabaseSchema,
cdmDatabaseSchema,
tempEmulationSchema,
cohortTableNames,
incremental,
incrementalFolder) {
generate <- TRUE
if (incremental) {
recordKeepingFile <- file.path(incrementalFolder, "GeneratedTemplateCohorts.csv")
generate <- isTaskRequired(
templateId = self$getId(),
checksum = self$getChecksum(),
recordKeepingFile = recordKeepingFile
)
}

if (generate) {
args <- self$executeArgs
args$connection <- connection
args$cohortDatabaseSchema <- cohortDatabaseSchema
args$cdmDatabaseSchema <- cdmDatabaseSchema
args$cohortTableNames <- cohortTableNames
args$tempEmulationSchema <- tempEmulationSchema
start <- Sys.time()
do.call(self$executeFun, args)
end <- Sys.time()
status <- "COMPLETE"
if (incremental) {
recordTasksDone(
templateId = self$getId(),
checksum = self$getChecksum(),
recordKeepingFile = recordKeepingFile
)
}
} else {
start <- NA
end <- NA
status <- "SKIPPED"
}

return(list(startTime = start, endTime = end, generationStatus = status))
},

getTemplateReferences = function(connection = NULL) {
args <- self$templateRefArgs
# Call templateRefFun and check if it returns a data frame
if (self$requireConnectionRefs) {
args$connection <- connection
}

result <- do.call(self$templateRefFun, args)
checkmate::assertDataFrame(result)
return(result)
},

getName = function() {
return(private$.name)
},

getId = function() {
return(private$.id)
},

getChecksum = function() {
return(private$.checksum)
}
)
)

#' Create Cohort Template Definition
#' @description construct a cohort template definition
createCohortTemplateDefintion <- function(name,
templateRefFun,
executeFun,
templateRefArgs,
executeArgs,
requireConnectionRefs) {
# templateRefFun, executeFun, templateRefArgs = list(), executeArgs = list(), requireConnectionRefs = FALSE
def <- CohortTemplateDefinition$new(name = name,
templateRefFun = templateRefFun,
executeFun = executeFun,
templateRefArgs = templateRefArgs,
executeArgs = executeArgs,
requireConnectionRefs = requireConnectionRefs)

return(invisible(def))
}

.getTemplateDefinitions <- function(cohortDefinitionSet) {
templates <- attr(cohortDefinitionSet, "templateCohortDefinitions")
if (is.null(templates)) {
templates <- list()
}
return(templates)
}


#' Add Cohort template definition to cohort set
#' @description Adds a cohort template definition to an existing cohort definition set or creates one if none provided
#' @inheritParams generateCohortSet
#' @export
#' @param connection An optional connection. If the cohort
#' @param cohortTemplateDefintion An instance of CohortTemplateDefinition (or subclass)
addCohortTemplateDefintion <- function(cohortDefinitionSet = createEmptyCohortDefinitionSet(),
connection = NULL,
cohortTemplateDefintion) {
checkmate::assertTRUE(isCohortDefinitionSet(cohortDefinitionSet))
checkmate::assertR6(cohortTemplateDefintion, "CohortTemplateDefinition")

if (is.null(connection) & cohortTemplateDefintion$requireConnectionRefs)
stop("Template definition requires connection to CDM to generate references (e.g. for use of vocabulary tables)")

if (is.null(attr(cohortDefinitionSet, "templateCohortDefinitions"))) {
attr(cohortDefinitionSet, "templateCohortDefinitions") <- list()

if (nrow(cohortDefinitionSet) > 0)
cohortDefinitionSet$isTemplatedCohort <- FALSE

Check warning on line 198 in R/TemplateCohorts.R

View check run for this annotation

Codecov / codecov/patch

R/TemplateCohorts.R#L198

Added line #L198 was not covered by tests
}
tplId <- cohortTemplateDefintion$getId()
templateDefs <- attr(cohortDefinitionSet, "templateCohortDefinitions")
if (tplId %in% names(templateDefs)) {
stop("Template definition with the same ID already added to cohort definition set")

Check warning on line 203 in R/TemplateCohorts.R

View check run for this annotation

Codecov / codecov/patch

R/TemplateCohorts.R#L203

Added line #L203 was not covered by tests
}

references <- cohortTemplateDefintion$getTemplateReferences(connection = connection)
if (nrow(references) == 0) {
stop("No references found")
}

checkmate::assertNames(colnames(references),
must.include = c(
"cohortId",
"cohortName"
)
)

if (!"json" %in% colnames(references)) {
references$json <- paste("{}")
}

# Cohort ID in sql for unqiueness in checksum
if (is.null(references$sql))
references$sql <- paste0("SELECT '", references$cohortId, " - ", cohortTemplateDefintion$getName(), "';")

references$isTemplatedCohort <- TRUE

# Assert ids are not in
if (any(references$cohortId %in% cohortDefinitionSet$cohortId)) {
stop("Cannot add reference set to cohort as it would result in non-unique cohort identifiers")

Check warning on line 230 in R/TemplateCohorts.R

View check run for this annotation

Codecov / codecov/patch

R/TemplateCohorts.R#L230

Added line #L230 was not covered by tests
}

templateDefs[[tplId]] <- cohortTemplateDefintion
attr(cohortDefinitionSet, "templateCohortDefinitions") <- templateDefs
cohortDefinitionSet <- dplyr::bind_rows(cohortDefinitionSet,
references)

return(cohortDefinitionSet)
}
Loading
Loading