|
| 1 | +# Copyright 2023 Observational Health Data Sciences and Informatics |
| 2 | +# |
| 3 | +# This file is part of DataQualityDashboard |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +#' Determines if all checks are present expected to calculate the 'Not Applicable' status |
| 18 | +#' |
| 19 | +#' @param checkResults A dataframe containing the results of the data quality checks |
| 20 | +#' |
| 21 | +#' @keywords internal |
| 22 | +.hasNAchecks <- function(checkResults) { |
| 23 | + checkNames <- unique(checkResults$checkName) |
| 24 | + return(.containsNAchecks(checkNames)) |
| 25 | +} |
| 26 | + |
| 27 | +#' Determines if all checks required for 'Not Applicable' status are in the checkNames |
| 28 | +#' |
| 29 | +#' @param checkNames A character vector of check names |
| 30 | +#' |
| 31 | +#' @keywords internal |
| 32 | +.containsNAchecks <- function(checkNames) { |
| 33 | + naCheckNames <- c("cdmTable", "cdmField", "measureValueCompleteness") |
| 34 | + missingNAChecks <- !(naCheckNames %in% checkNames) |
| 35 | + if (any(missingNAChecks)) { |
| 36 | + return(FALSE) |
| 37 | + } |
| 38 | + return(TRUE) |
| 39 | +} |
| 40 | + |
| 41 | +#' Applies the 'Not Applicable' status to a single check |
| 42 | +#' |
| 43 | +#' @param x Results from a single check |
| 44 | +#' |
| 45 | +#' @keywords internal |
| 46 | +.applyNotApplicable <- function(x) { |
| 47 | + # Errors precede all other statuses |
| 48 | + if (x$isError == 1) { |
| 49 | + return(0) |
| 50 | + } |
| 51 | + |
| 52 | + # No NA status for cdmTable and cdmField if missing |
| 53 | + if (x$checkName == "cdmTable" || x$checkName == "cdmField") { |
| 54 | + return(0) |
| 55 | + } |
| 56 | + |
| 57 | + if (any(x$tableIsMissing, x$fieldIsMissing, x$tableIsEmpty, na.rm = TRUE)) { |
| 58 | + return(1) |
| 59 | + } |
| 60 | + |
| 61 | + # No NA status for measureValueCompleteness if empty |
| 62 | + if (x$checkName == "measureValueCompleteness") { |
| 63 | + return(0) |
| 64 | + } |
| 65 | + |
| 66 | + if (any(x$fieldIsEmpty, x$conceptIsMissing, x$conceptAndUnitAreMissing, na.rm = TRUE)) { |
| 67 | + return(1) |
| 68 | + } |
| 69 | + |
| 70 | + return(0) |
| 71 | +} |
| 72 | + |
| 73 | +#' Determines if check should be notApplicable and the notApplicableReason |
| 74 | +#' |
| 75 | +#' @param checkResults A dataframe containing the results of the data quality checks |
| 76 | +#' |
| 77 | +#' @keywords internal |
| 78 | +.calculateNotApplicableStatus <- function(checkResults) { |
| 79 | + # Look up missing tables and add variable tableIsMissing to checkResults |
| 80 | + missingTables <- checkResults %>% |
| 81 | + dplyr::filter( |
| 82 | + .data$checkName == "cdmTable" |
| 83 | + ) %>% |
| 84 | + dplyr::mutate( |
| 85 | + .data$cdmTableName, |
| 86 | + tableIsMissing = .data$failed == 1, |
| 87 | + .keep = "none" |
| 88 | + ) |
| 89 | + |
| 90 | + # Look up missing fields and add variable fieldIsMissing to checkResults |
| 91 | + missingFields <- checkResults %>% |
| 92 | + dplyr::filter( |
| 93 | + .data$checkName == "cdmField" |
| 94 | + ) %>% |
| 95 | + dplyr::mutate( |
| 96 | + .data$cdmTableName, |
| 97 | + .data$cdmFieldName, |
| 98 | + fieldIsMissing = .data$failed == 1, |
| 99 | + .keep = "none" |
| 100 | + ) |
| 101 | + |
| 102 | + # Look up empty tables and add variable tableIsEmpty to checkResults |
| 103 | + emptyTables <- checkResults %>% |
| 104 | + dplyr::filter( |
| 105 | + .data$checkName == "measureValueCompleteness" |
| 106 | + ) %>% |
| 107 | + dplyr::mutate( |
| 108 | + .data$cdmTableName, |
| 109 | + tableIsEmpty = .data$numDenominatorRows == 0, |
| 110 | + .keep = "none" |
| 111 | + ) %>% |
| 112 | + dplyr::distinct() |
| 113 | + |
| 114 | + # Look up empty fields and add variable fieldIsEmpty to checkResults |
| 115 | + emptyFields <- checkResults %>% |
| 116 | + dplyr::filter( |
| 117 | + .data$checkName == "measureValueCompleteness" |
| 118 | + ) %>% |
| 119 | + dplyr::mutate( |
| 120 | + .data$cdmTableName, |
| 121 | + .data$cdmFieldName, |
| 122 | + fieldIsEmpty = .data$numDenominatorRows == .data$numViolatedRows, |
| 123 | + .keep = "none" |
| 124 | + ) |
| 125 | + |
| 126 | + # Assign notApplicable status |
| 127 | + checkResults <- checkResults %>% |
| 128 | + dplyr::left_join( |
| 129 | + missingTables, |
| 130 | + by = "cdmTableName" |
| 131 | + ) %>% |
| 132 | + dplyr::left_join( |
| 133 | + missingFields, |
| 134 | + by = c("cdmTableName", "cdmFieldName") |
| 135 | + ) %>% |
| 136 | + dplyr::left_join( |
| 137 | + emptyTables, |
| 138 | + by = "cdmTableName" |
| 139 | + ) %>% |
| 140 | + dplyr::left_join( |
| 141 | + emptyFields, |
| 142 | + by = c("cdmTableName", "cdmFieldName") |
| 143 | + ) %>% |
| 144 | + dplyr::mutate( |
| 145 | + conceptIsMissing = .data$checkLevel == "CONCEPT" & is.na(.data$unitConceptId) & .data$numDenominatorRows == 0, |
| 146 | + conceptAndUnitAreMissing = .data$checkLevel == "CONCEPT" & !is.na(.data$unitConceptId) & .data$numDenominatorRows == 0, |
| 147 | + fieldIsMissing = dplyr::coalesce(.data$fieldIsMissing, !is.na(.data$cdmFieldName)), |
| 148 | + fieldIsEmpty = dplyr::coalesce(.data$fieldIsEmpty, !is.na(.data$cdmFieldName)), |
| 149 | + ) |
| 150 | + |
| 151 | + checkResults$notApplicable <- NA |
| 152 | + checkResults$notApplicableReason <- NA |
| 153 | + |
| 154 | + conditionOccurrenceIsMissing <- missingTables %>% dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>% dplyr::pull(tableIsMissing) |
| 155 | + conditionOccurrenceIsEmpty <- emptyTables %>% dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>% dplyr::pull(tableIsEmpty) |
| 156 | + for (i in seq_len(nrow(checkResults))) { |
| 157 | + # Special rule for measureConditionEraCompleteness, which should be notApplicable if CONDITION_OCCURRENCE is empty |
| 158 | + if (checkResults[i, "checkName"] == "measureConditionEraCompleteness") { |
| 159 | + if (conditionOccurrenceIsMissing || conditionOccurrenceIsEmpty) { |
| 160 | + checkResults$notApplicable[i] <- 1 |
| 161 | + checkResults$notApplicableReason[i] <- "Table CONDITION_OCCURRENCE is empty." |
| 162 | + } else { |
| 163 | + checkResults$notApplicable[i] <- 0 |
| 164 | + } |
| 165 | + } else { |
| 166 | + checkResults$notApplicable[i] <- .applyNotApplicable(checkResults[i, ]) |
| 167 | + } |
| 168 | + } |
| 169 | + |
| 170 | + checkResults <- checkResults %>% |
| 171 | + dplyr::mutate( |
| 172 | + notApplicableReason = ifelse( |
| 173 | + .data$notApplicable == 1, |
| 174 | + dplyr::case_when( |
| 175 | + !is.na(.data$notApplicableReason) ~ .data$notApplicableReason, |
| 176 | + .data$tableIsMissing ~ sprintf("Table %s does not exist.", .data$cdmTableName), |
| 177 | + .data$fieldIsMissing ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName), |
| 178 | + .data$tableIsEmpty ~ sprintf("Table %s is empty.", .data$cdmTableName), |
| 179 | + .data$fieldIsEmpty ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName), |
| 180 | + .data$conceptIsMissing ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName), |
| 181 | + .data$conceptAndUnitAreMissing ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName) #nolint |
| 182 | + ), |
| 183 | + NA |
| 184 | + ), |
| 185 | + failed = ifelse(.data$notApplicable == 1, 0, .data$failed), |
| 186 | + passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0) |
| 187 | + ) %>% |
| 188 | + dplyr::select(-c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing")) |
| 189 | + |
| 190 | + return(checkResults) |
| 191 | +} |
0 commit comments