OHDSI · azimov · Mar 26, 2024 · Mar 26, 2024 · Mar 26, 2024 · Mar 27, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,22 +2,30 @@
 
 export(CohortSubsetDefinition)
 export(CohortSubsetOperator)
+export(CohortTemplateDefinition)
 export(DemographicSubsetOperator)
 export(LimitSubsetOperator)
 export(SubsetCohortWindow)
 export(SubsetOperator)
 export(addCohortSubsetDefinition)
+export(addCohortTemplateDefintion)
+export(addSqlCohortDefinition)
 export(checkAndFixCohortDefinitionSetDataTypes)
 export(computeChecksum)
+export(createAtcCohortTemplateDefinition)
 export(createCohortSubset)
 export(createCohortSubsetDefinition)
 export(createCohortTables)
+export(createCohortTemplateDefintion)
 export(createDemographicSubset)
 export(createEmptyCohortDefinitionSet)
 export(createEmptyNegativeControlOutcomeCohortSet)
 export(createLimitSubset)
 export(createResultsDataModel)
+export(createRxNormCohortTemplateDefinition)
+export(createSnomedCohortTemplateDefinition)
 export(createSubsetCohortWindow)
+export(createUnionCohortTemplate)
 export(dropCohortStatsTables)
 export(exportCohortStatsTables)
 export(generateCohortSet)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,9 @@
 CohortGenerator 0.12.0
 ======================
 
+- creation of cohort_checksum tables that enable verification of generated cohorts and incremental execution in distributed
+environments
+
 - Backwards compatable extension to CohortSubsetOperators and cohortSubsetWindows to allow windowing to be logic of any
 length
 

diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R
diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R
@@ -194,6 +194,8 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
 #'
 #' @param sqlFolder        The name of the folder that will hold the SQL representation
 #'                         of the cohort.
+#' @param templateFolder  Defines the folder storing SQL template cohorts that can be loaded as part of the definition.
+#'                        JSON files in this folder are loaded into the cohort definition set.
 #'
 #' @param cohortFileNameFormat  Defines the format string  for naming the cohort
 #'                              JSON and SQL files. The format string follows the
@@ -221,6 +223,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
                                    cohortFileNameFormat = "%s",
                                    cohortFileNameValue = c("cohortId"),
                                    subsetJsonFolder = "inst/cohort_subset_definitions/",
+                                   templateFolder = "inst/cohort_template_definitions/",
                                    packageName = NULL,
                                    warnOnMissingJson = TRUE,
                                    verbose = FALSE) {
@@ -299,6 +302,8 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
   }
 
   cohortDefinitionSet <- cbind(settings, fileData)
+  cohortDefinitionSet <- loadTemplateDefinitionsFolder(cohortDefinitionSet, templateFolder)
+
   # Loading cohort subset definitions with their associated targets
   if (loadSubsets & nrow(subsetsToLoad) > 0) {
     if (dir.exists(subsetJsonFolder)) {
@@ -355,6 +360,8 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
 #'                              in conjunction with the cohortFileNameFormat parameter.
 #'
 #' @param subsetJsonFolder      Defines the folder to store the subset JSON
+#' @param templateFolder     Defines the folder in which SQL template cohorts are saved as part of the definition.
+#'                              The SQL is copied to this location when `saveCohortDefinitionSet` is called.
 #'
 #' @param verbose           When TRUE, logging messages are emitted to indicate export
 #'                          progress.
@@ -367,12 +374,24 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
                                     cohortFileNameFormat = "%s",
                                     cohortFileNameValue = c("cohortId"),
                                     subsetJsonFolder = "inst/cohort_subset_definitions/",
+                                    templateFolder = "inst/cohort_template_definitions/",
                                     verbose = FALSE) {
   checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
   checkmate::assert_vector(cohortFileNameValue)
   checkmate::assert_true(length(cohortFileNameValue) > 0)
   assertSettingsColumns(names(cohortDefinitionSet))
   checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
+
+  templateDefinitions <- getTemplateDefinitions(cohortDefinitionSet)
+  if (length(templateDefinitions) > 0) {
+    saveCohortTemplateDefinitions(templateDefinitions, templateFolder)
+    if (all(cohortDefinitionSet$isTemplatedCohort))
+      return(invisible())
+    # Don't save templates as regular cohorts
+    cohortDefinitionSet <- cohortDefinitionSet |>
+      dplyr::filter(!.data$isTemplatedCohort)
+  }
+
   settingsFolder <- dirname(settingsFileName)
   if (!dir.exists(settingsFolder)) {
     dir.create(settingsFolder, recursive = TRUE)

diff --git a/R/CohortSample.R b/R/CohortSample.R
@@ -31,10 +31,10 @@
   countSql <- "SELECT COUNT(DISTINCT SUBJECT_ID) as cnt FROM  @cohort_database_schema.@target_table
    WHERE cohort_definition_id = @target_cohort_id"
   count <- DatabaseConnector::renderTranslateQuerySql(connection,
-    countSql,
-    cohort_database_schema = cohortDatabaseSchema,
-    target_cohort_id = targetCohortId,
-    target_table = targetTable
+                                                      countSql,
+                                                      cohort_database_schema = cohortDatabaseSchema,
+                                                      target_cohort_id = targetCohortId,
+                                                      target_table = targetTable
   ) %>%
     dplyr::pull()
 
@@ -64,11 +64,16 @@
                           targetTable,
                           outputCohortId,
                           outputTable,
+                          checksumTable,
                           cohortDatabaseSchema,
                           outputDatabaseSchema,
                           sampleTable,
                           seed,
-                          tempEmulationSchema) {
+                          tempEmulationSchema,
+                          checksum,
+                          incremental,
+                          recordKeepingFile) {
+  startTime <- lubridate::now()
   randSampleTableName <- paste0("#SAMPLE_TABLE_", seed)
   DatabaseConnector::insertTable(
     connection = connection,
@@ -80,9 +85,8 @@
   )
 
   execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", package = "CohortGenerator"))
-  DatabaseConnector::renderTranslateExecuteSql(connection,
+  execSql <- SqlRender::render(
     execSql,
-    tempEmulationSchema = tempEmulationSchema,
     random_sample_table = randSampleTableName,
     target_cohort_id = targetCohortId,
     output_cohort_id = outputCohortId,
@@ -91,6 +95,18 @@
     output_table = outputTable,
     target_table = targetTable
   )
+  execSql <- SqlRender::translate(execSql,
+                                  targetDialect = DatabaseConnector::dbms(connection))
+
+  .runCohortSql(connection = connection,
+                sql = execSql,
+                startTime = startTime,
+                resultsDatabaseSchema = cohortDatabaseSchema,
+                cohortChecksumTable = checksumTable,
+                incremental = incremental,
+                cohortId = outputCohortId,
+                checksum = checksum,
+                recordKeepingFile = recordKeepingFile)$generationStatus
 }
 
 
@@ -169,11 +185,11 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
   checkmate::assertIntegerish(seed, min.len = 1)
   checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
   checkmate::assertNames(colnames(cohortDefinitionSet),
-    must.include = c(
-      "cohortId",
-      "cohortName",
-      "sql"
-    )
+                         must.include = c(
+                           "cohortId",
+                           "cohortName",
+                           "sql"
+                         )
   )
 
   if (is.null(n) && is.null(sampleFraction)) {
@@ -204,6 +220,10 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
   }
 
   .checkCohortTables(connection, cohortDatabaseSchema, cohortTableNames)
+  computedChecksums <- getLastGeneratedCohortChecksums(connection = connection,
+                                                       cohortDatabaseSchema = cohortDatabaseSchema,
+                                                       cohortTableNames = cohortTableNames)
+
   sampledCohorts <-
     base::Map(function(seed, targetCohortId) {
       sampledCohortDefinition <- cohortDefinitionSet %>%
@@ -240,12 +260,14 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
         )
       }
 
-      if (incremental && !isTaskRequired(
-        cohortId = outputCohortId,
-        seed = seed,
-        checksum = computeChecksum(paste0(sampledCohortDefinition$sql, n, seed, outputCohortId)),
-        recordKeepingFile = recordKeepingFile
-      )) {
+      sampleChecksum <- computeChecksum(paste0(sampledCohortDefinition$sql, n, seed, outputCohortId))
+      cohortComputed <- computedChecksums |>
+        dplyr::filter(.data$checksum == sampleChecksum,
+                      .data$cohortDefinitionId == outputCohortId) |>
+        dplyr::count() |>
+        dplyr::pull() > 0
+
+      if (incremental && cohortComputed) {
         sampledCohortDefinition$status <- "skipped"
         return(sampledCohortDefinition)
       }
@@ -265,33 +287,28 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
         rlang::inform(paste0("No entires found for ", targetCohortId, " was it generated?"))
         return(sampledCohortDefinition)
       }
+
       # Called only for side effects
-      .sampleCohort(
+      sampledCohortDefinition$status <- .sampleCohort(
         connection = connection,
         targetCohortId = targetCohortId,
         targetTable = cohortTableNames$cohortTable,
         outputCohortId = outputCohortId,
         outputTable = cohortTableNames$cohortSampleTable,
+        checksumTable = cohortTableNames$cohortChecksumTable,
         cohortDatabaseSchema = cohortDatabaseSchema,
         outputDatabaseSchema = outputDatabaseSchema,
         sampleTable = sampleTable,
         seed = seed + targetCohortId, # Seed is unique to each target cohort
-        tempEmulationSchema = tempEmulationSchema
+        tempEmulationSchema = tempEmulationSchema,
+        checksum = sampleChecksum,
+        incremental = incremental,
+        recordKeepingFile = recordKeepingFile
       )
 
-      sampledCohortDefinition$status <- "generated"
-      if (incremental) {
-        recordTasksDone(
-          cohortId = sampledCohortDefinition$cohortId,
-          seed = seed,
-          checksum = computeChecksum(paste0(sampledCohortDefinition$sql, n, seed, outputCohortId)),
-          recordKeepingFile = recordKeepingFile
-        )
-      }
       return(sampledCohortDefinition)
     }, seed, cohortIds) %>%
-    dplyr::bind_rows()
-
+      dplyr::bind_rows()
 
 
   attr(sampledCohorts, "isSampledCohortDefinition") <- TRUE

diff --git a/R/CohortTables.R b/R/CohortTables.R
@@ -35,6 +35,7 @@
 #'                                     inclusion rule statistics.
 #' @param cohortCensorStatsTable       Name of the censor stats table, one of the tables for storing
 #'                                     inclusion rule statistics.
+#' @param cohortChecksumTable          Name of the checksum table, which stores the checksum of the cohort used and the times at which generation starts and ends.
 #'
 #' @returns
 #' A list of the table names as specified in the parameters to this function.
@@ -46,15 +47,17 @@ getCohortTableNames <- function(cohortTable = "cohort",
                                 cohortInclusionResultTable = paste0(cohortTable, "_inclusion_result"),
                                 cohortInclusionStatsTable = paste0(cohortTable, "_inclusion_stats"),
                                 cohortSummaryStatsTable = paste0(cohortTable, "_summary_stats"),
-                                cohortCensorStatsTable = paste0(cohortTable, "_censor_stats")) {
+                                cohortCensorStatsTable = paste0(cohortTable, "_censor_stats"),
+                                cohortChecksumTable = paste0(cohortTable, "_checksum")) {
   return(list(
     cohortTable = cohortTable,
     cohortSampleTable = cohortSampleTable,
     cohortInclusionTable = cohortInclusionTable,
     cohortInclusionResultTable = cohortInclusionResultTable,
     cohortInclusionStatsTable = cohortInclusionStatsTable,
     cohortSummaryStatsTable = cohortSummaryStatsTable,
-    cohortCensorStatsTable = cohortCensorStatsTable
+    cohortCensorStatsTable = cohortCensorStatsTable,
+    cohortChecksumTable = cohortChecksumTable
   ))
 }
 
@@ -121,13 +124,15 @@ createCohortTables <- function(connectionDetails = NULL,
       create_cohort_inclusion_stats_table = createTableFlagList$cohortInclusionStatsTable,
       create_cohort_summary_stats_table = createTableFlagList$cohortSummaryStatsTable,
       create_cohort_censor_stats_table = createTableFlagList$cohortCensorStatsTable,
+      create_cohort_checksum_table = createTableFlagList$cohortChecksumTable,
       cohort_table = cohortTableNames$cohortTable,
       cohort_sample_table = cohortTableNames$cohortSampleTable,
       cohort_inclusion_table = cohortTableNames$cohortInclusionTable,
       cohort_inclusion_result_table = cohortTableNames$cohortInclusionResultTable,
       cohort_inclusion_stats_table = cohortTableNames$cohortInclusionStatsTable,
       cohort_summary_stats_table = cohortTableNames$cohortSummaryStatsTable,
       cohort_censor_stats_table = cohortTableNames$cohortCensorStatsTable,
+      cohort_checksum_table = cohortTableNames$cohortChecksumTable,
       warnOnMissingParameters = TRUE
     )
     sql <- SqlRender::translate(

diff --git a/R/Incremental.R b/R/Incremental.R
@@ -22,6 +22,7 @@
 #' to store in a record keeping file. This function leverages the md5
 #' hash from the digest package
 #'
+#'
 #' @param val   The value to hash. It is converted to a character to perform
 #'              the hash.
 #'
@@ -30,7 +31,12 @@
 #'
 #' @export
 computeChecksum <- function(val) {
-  return(sapply(as.character(val), digest::digest, algo = "md5", serialize = FALSE))
+  val <- as.character(val)
+  # remove line breaks, then trim leading/trailing whitespace
+  val <- gsub("[\r\n]", "", val)
+  val <- trimws(val)
+  hashes <- sapply(val, digest::digest, algo = "md5", serialize = FALSE, USE.NAMES = FALSE)
+  return(hashes)
 }
 
 #' Is a task required when running in incremental mode

diff --git a/R/SubsetDefinitions.R b/R/SubsetDefinitions.R
@@ -92,7 +92,8 @@ CohortSubsetDefinition <- R6::R6Class(
     #' Returns vector of join, logic, having statements returned by subset operations
     #' @param targetOutputPair              Target output pair
     getSubsetQuery = function(targetOutputPair) {
-      checkmate::assertIntegerish(targetOutputPair, len = 2)
+      checkmate::assertNumeric(targetOutputPair, len = 2)
+      checkmate::assertTRUE(all(targetOutputPair %% 1 == 0))
       checkmate::assertFALSE(targetOutputPair[[1]] == targetOutputPair[[2]])
 
       targetTable <- "#cohort_sub_base"
@@ -133,7 +134,8 @@ CohortSubsetDefinition <- R6::R6Class(
     #' @param cohortDefinitionSet           Cohort definition set containing base names
     #' @param targetOutputPair              Target output pair
     getSubsetCohortName = function(cohortDefinitionSet, targetOutputPair) {
-      checkmate::assertIntegerish(targetOutputPair, len = 2)
+      checkmate::assertNumeric(targetOutputPair, len = 2)
+      checkmate::assertTRUE(all(targetOutputPair %% 1 == 0))
       checkmate::assertFALSE(targetOutputPair[[1]] == targetOutputPair[[2]])
       checkmate::assertTRUE(targetOutputPair[[1]] %in% cohortDefinitionSet$cohortId)
       checkmate::assertTRUE(isCohortDefinitionSet(cohortDefinitionSet))
@@ -156,7 +158,9 @@ CohortSubsetDefinition <- R6::R6Class(
     #' Set the targetOutputPairs to be added to a cohort definition set
     #' @param targetIds   list of cohort ids to apply subsetting operations to
     setTargetOutputPairs = function(targetIds) {
-      checkmate::assertIntegerish(targetIds, min.len = 1, upper = 10e11)
+      checkmate::assertNumeric(targetIds, min.len = 1)
+      checkmate::assertTRUE(all(targetIds %% 1 == 0))
+
       definitionId <- self$definitionId
       targetOutputPairs <- list()
 
@@ -190,7 +194,8 @@ CohortSubsetDefinition <- R6::R6Class(
         targetOutputPairs,
         function(targetOutputPair) {
           targetOutputPair <- as.numeric(targetOutputPair)
-          checkmate::assertIntegerish(targetOutputPair, len = 2, upper = 10e11)
+          checkmate::assertNumeric(targetOutputPair, len = 2)
+          checkmate::assertTRUE(all(targetOutputPair %% 1 == 0))
           checkmate::assertFALSE(targetOutputPair[[1]] == targetOutputPair[[2]])
           targetOutputPair
         }