Skip to content

Commit db05f43

Browse files
Merge pull request #527 from OHDSI/na-status-refactor
NotApplicable fix for measureConditionEraCompleteness
2 parents 3af4d8f + 9e451d3 commit db05f43

5 files changed

+289
-120
lines changed

R/calculateNotApplicableStatus.R

+191
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
# Copyright 2023 Observational Health Data Sciences and Informatics
2+
#
3+
# This file is part of DataQualityDashboard
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
#' Determines if all checks are present expected to calculate the 'Not Applicable' status
18+
#'
19+
#' @param checkResults A dataframe containing the results of the data quality checks
20+
#'
21+
#' @keywords internal
22+
.hasNAchecks <- function(checkResults) {
23+
checkNames <- unique(checkResults$checkName)
24+
return(.containsNAchecks(checkNames))
25+
}
26+
27+
#' Determines if all checks required for 'Not Applicable' status are in the checkNames
28+
#'
29+
#' @param checkNames A character vector of check names
30+
#'
31+
#' @keywords internal
32+
.containsNAchecks <- function(checkNames) {
33+
naCheckNames <- c("cdmTable", "cdmField", "measureValueCompleteness")
34+
missingNAChecks <- !(naCheckNames %in% checkNames)
35+
if (any(missingNAChecks)) {
36+
return(FALSE)
37+
}
38+
return(TRUE)
39+
}
40+
41+
#' Applies the 'Not Applicable' status to a single check
42+
#'
43+
#' @param x Results from a single check
44+
#'
45+
#' @keywords internal
46+
.applyNotApplicable <- function(x) {
47+
# Errors precede all other statuses
48+
if (x$isError == 1) {
49+
return(0)
50+
}
51+
52+
# No NA status for cdmTable and cdmField if missing
53+
if (x$checkName == "cdmTable" || x$checkName == "cdmField") {
54+
return(0)
55+
}
56+
57+
if (any(x$tableIsMissing, x$fieldIsMissing, x$tableIsEmpty, na.rm = TRUE)) {
58+
return(1)
59+
}
60+
61+
# No NA status for measureValueCompleteness if empty
62+
if (x$checkName == "measureValueCompleteness") {
63+
return(0)
64+
}
65+
66+
if (any(x$fieldIsEmpty, x$conceptIsMissing, x$conceptAndUnitAreMissing, na.rm = TRUE)) {
67+
return(1)
68+
}
69+
70+
return(0)
71+
}
72+
73+
#' Determines if check should be notApplicable and the notApplicableReason
74+
#'
75+
#' @param checkResults A dataframe containing the results of the data quality checks
76+
#'
77+
#' @keywords internal
78+
.calculateNotApplicableStatus <- function(checkResults) {
79+
# Look up missing tables and add variable tableIsMissing to checkResults
80+
missingTables <- checkResults %>%
81+
dplyr::filter(
82+
.data$checkName == "cdmTable"
83+
) %>%
84+
dplyr::mutate(
85+
.data$cdmTableName,
86+
tableIsMissing = .data$failed == 1,
87+
.keep = "none"
88+
)
89+
90+
# Look up missing fields and add variable fieldIsMissing to checkResults
91+
missingFields <- checkResults %>%
92+
dplyr::filter(
93+
.data$checkName == "cdmField"
94+
) %>%
95+
dplyr::mutate(
96+
.data$cdmTableName,
97+
.data$cdmFieldName,
98+
fieldIsMissing = .data$failed == 1,
99+
.keep = "none"
100+
)
101+
102+
# Look up empty tables and add variable tableIsEmpty to checkResults
103+
emptyTables <- checkResults %>%
104+
dplyr::filter(
105+
.data$checkName == "measureValueCompleteness"
106+
) %>%
107+
dplyr::mutate(
108+
.data$cdmTableName,
109+
tableIsEmpty = .data$numDenominatorRows == 0,
110+
.keep = "none"
111+
) %>%
112+
dplyr::distinct()
113+
114+
# Look up empty fields and add variable fieldIsEmpty to checkResults
115+
emptyFields <- checkResults %>%
116+
dplyr::filter(
117+
.data$checkName == "measureValueCompleteness"
118+
) %>%
119+
dplyr::mutate(
120+
.data$cdmTableName,
121+
.data$cdmFieldName,
122+
fieldIsEmpty = .data$numDenominatorRows == .data$numViolatedRows,
123+
.keep = "none"
124+
)
125+
126+
# Assign notApplicable status
127+
checkResults <- checkResults %>%
128+
dplyr::left_join(
129+
missingTables,
130+
by = "cdmTableName"
131+
) %>%
132+
dplyr::left_join(
133+
missingFields,
134+
by = c("cdmTableName", "cdmFieldName")
135+
) %>%
136+
dplyr::left_join(
137+
emptyTables,
138+
by = "cdmTableName"
139+
) %>%
140+
dplyr::left_join(
141+
emptyFields,
142+
by = c("cdmTableName", "cdmFieldName")
143+
) %>%
144+
dplyr::mutate(
145+
conceptIsMissing = .data$checkLevel == "CONCEPT" & is.na(.data$unitConceptId) & .data$numDenominatorRows == 0,
146+
conceptAndUnitAreMissing = .data$checkLevel == "CONCEPT" & !is.na(.data$unitConceptId) & .data$numDenominatorRows == 0,
147+
fieldIsMissing = dplyr::coalesce(.data$fieldIsMissing, !is.na(.data$cdmFieldName)),
148+
fieldIsEmpty = dplyr::coalesce(.data$fieldIsEmpty, !is.na(.data$cdmFieldName)),
149+
)
150+
151+
checkResults$notApplicable <- NA
152+
checkResults$notApplicableReason <- NA
153+
154+
conditionOccurrenceIsMissing <- missingTables %>% dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>% dplyr::pull(tableIsMissing)
155+
conditionOccurrenceIsEmpty <- emptyTables %>% dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>% dplyr::pull(tableIsEmpty)
156+
for (i in seq_len(nrow(checkResults))) {
157+
# Special rule for measureConditionEraCompleteness, which should be notApplicable if CONDITION_OCCURRENCE is empty
158+
if (checkResults[i, "checkName"] == "measureConditionEraCompleteness") {
159+
if (conditionOccurrenceIsMissing || conditionOccurrenceIsEmpty) {
160+
checkResults$notApplicable[i] <- 1
161+
checkResults$notApplicableReason[i] <- "Table CONDITION_OCCURRENCE is empty."
162+
} else {
163+
checkResults$notApplicable[i] <- 0
164+
}
165+
} else {
166+
checkResults$notApplicable[i] <- .applyNotApplicable(checkResults[i, ])
167+
}
168+
}
169+
170+
checkResults <- checkResults %>%
171+
dplyr::mutate(
172+
notApplicableReason = ifelse(
173+
.data$notApplicable == 1,
174+
dplyr::case_when(
175+
!is.na(.data$notApplicableReason) ~ .data$notApplicableReason,
176+
.data$tableIsMissing ~ sprintf("Table %s does not exist.", .data$cdmTableName),
177+
.data$fieldIsMissing ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName),
178+
.data$tableIsEmpty ~ sprintf("Table %s is empty.", .data$cdmTableName),
179+
.data$fieldIsEmpty ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName),
180+
.data$conceptIsMissing ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName),
181+
.data$conceptAndUnitAreMissing ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName) #nolint
182+
),
183+
NA
184+
),
185+
failed = ifelse(.data$notApplicable == 1, 0, .data$failed),
186+
passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0)
187+
) %>%
188+
dplyr::select(-c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing"))
189+
190+
return(checkResults)
191+
}

R/evaluateThresholds.R

+2-112
Original file line numberDiff line numberDiff line change
@@ -163,119 +163,9 @@
163163
}
164164
}
165165

166-
missingTables <- dplyr::select(
167-
dplyr::filter(checkResults, .data$checkName == "cdmTable" & .data$failed == 1),
168-
"cdmTableName"
169-
)
170-
if (nrow(missingTables) > 0) {
171-
missingTables$tableIsMissing <- 1
172-
checkResults <- dplyr::mutate(
173-
dplyr::left_join(checkResults, missingTables, by = "cdmTableName"),
174-
tableIsMissing = ifelse(.data$checkName != "cdmTable" & .data$isError == 0, .data$tableIsMissing, NA)
175-
)
176-
} else {
177-
checkResults$tableIsMissing <- NA
166+
if (.hasNAchecks(checkResults)) {
167+
checkResults <- .calculateNotApplicableStatus(checkResults)
178168
}
179169

180-
missingFields <- dplyr::select(
181-
dplyr::filter(checkResults, .data$checkName == "cdmField" & .data$failed == 1 & is.na(.data$tableIsMissing)),
182-
"cdmTableName", "cdmFieldName"
183-
)
184-
if (nrow(missingFields) > 0) {
185-
missingFields$fieldIsMissing <- 1
186-
checkResults <- dplyr::mutate(
187-
dplyr::left_join(checkResults, missingFields, by = c("cdmTableName", "cdmFieldName")),
188-
fieldIsMissing = ifelse(.data$checkName != "cdmField" & .data$isError == 0, .data$fieldIsMissing, NA)
189-
)
190-
} else {
191-
checkResults$fieldIsMissing <- NA
192-
}
193-
194-
emptyTables <- dplyr::distinct(
195-
dplyr::select(
196-
dplyr::filter(checkResults, .data$checkName == "measureValueCompleteness" &
197-
.data$numDenominatorRows == 0 &
198-
.data$isError == 0 &
199-
is.na(.data$tableIsMissing) &
200-
is.na(.data$fieldIsMissing)),
201-
"cdmTableName"
202-
)
203-
)
204-
if (nrow(emptyTables) > 0) {
205-
emptyTables$tableIsEmpty <- 1
206-
checkResults <- dplyr::mutate(
207-
dplyr::left_join(checkResults, emptyTables, by = c("cdmTableName")),
208-
tableIsEmpty = ifelse(.data$checkName != "cdmField" & .data$checkName != "cdmTable" & .data$isError == 0, .data$tableIsEmpty, NA)
209-
)
210-
} else {
211-
checkResults$tableIsEmpty <- NA
212-
}
213-
214-
emptyFields <-
215-
dplyr::select(
216-
dplyr::filter(checkResults, .data$checkName == "measureValueCompleteness" &
217-
.data$numDenominatorRows == .data$numViolatedRows &
218-
is.na(.data$tableIsMissing) & is.na(.data$fieldIsMissing) & is.na(.data$tableIsEmpty)),
219-
"cdmTableName", "cdmFieldName"
220-
)
221-
if (nrow(emptyFields) > 0) {
222-
emptyFields$fieldIsEmpty <- 1
223-
checkResults <- dplyr::mutate(
224-
dplyr::left_join(checkResults, emptyFields, by = c("cdmTableName", "cdmFieldName")),
225-
fieldIsEmpty = ifelse(.data$checkName != "measureValueCompleteness" & .data$checkName != "cdmField" & .data$checkName != "isRequired" & .data$isError == 0, .data$fieldIsEmpty, NA)
226-
)
227-
} else {
228-
checkResults$fieldIsEmpty <- NA
229-
}
230-
231-
checkResults <- dplyr::mutate(
232-
checkResults,
233-
conceptIsMissing = ifelse(
234-
.data$isError == 0 &
235-
is.na(.data$tableIsMissing) &
236-
is.na(.data$fieldIsMissing) &
237-
is.na(.data$tableIsEmpty) &
238-
is.na(.data$fieldIsEmpty) &
239-
.data$checkLevel == "CONCEPT" &
240-
is.na(.data$unitConceptId) &
241-
.data$numDenominatorRows == 0,
242-
1,
243-
NA
244-
)
245-
)
246-
247-
checkResults <- dplyr::mutate(
248-
checkResults,
249-
conceptAndUnitAreMissing = ifelse(
250-
.data$isError == 0 &
251-
is.na(.data$tableIsMissing) &
252-
is.na(.data$fieldIsMissing) &
253-
is.na(.data$tableIsEmpty) &
254-
is.na(.data$fieldIsEmpty) &
255-
.data$checkLevel == "CONCEPT" &
256-
!is.na(.data$unitConceptId) &
257-
.data$numDenominatorRows == 0,
258-
1,
259-
NA
260-
)
261-
)
262-
263-
checkResults <- dplyr::mutate(
264-
checkResults,
265-
notApplicable = dplyr::coalesce(.data$tableIsMissing, .data$fieldIsMissing, .data$tableIsEmpty, .data$fieldIsEmpty, .data$conceptIsMissing, .data$conceptAndUnitAreMissing, 0),
266-
notApplicableReason = dplyr::case_when(
267-
!is.na(.data$tableIsMissing) ~ sprintf("Table %s does not exist.", .data$cdmTableName),
268-
!is.na(.data$fieldIsMissing) ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName),
269-
!is.na(.data$tableIsEmpty) ~ sprintf("Table %s is empty.", .data$cdmTableName),
270-
!is.na(.data$fieldIsEmpty) ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName),
271-
!is.na(.data$conceptIsMissing) ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName),
272-
!is.na(.data$conceptAndUnitAreMissing) ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName)
273-
)
274-
)
275-
276-
checkResults <- dplyr::select(checkResults, -c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing"))
277-
checkResults <- dplyr::mutate(checkResults, failed = ifelse(.data$notApplicable == 1, 0, .data$failed))
278-
checkResults <- dplyr::mutate(checkResults, passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0))
279-
280170
checkResults
281171
}

R/executeDqChecks.R

+2-8
Original file line numberDiff line numberDiff line change
@@ -114,16 +114,10 @@ executeDqChecks <- function(connectionDetails,
114114
stopifnot(is.character(cdmVersion))
115115

116116
# Warning if check names for determining NA is missing
117-
if (!length(checkNames) == 0) {
118-
naCheckNames <- c("cdmTable", "cdmField", "measureValueCompleteness")
119-
missingNAChecks <- !(naCheckNames %in% checkNames)
120-
if (any(missingNAChecks)) {
121-
missingNACheckNames <- paste(naCheckNames[missingNAChecks], collapse = ", ")
122-
warning(sprintf("Missing check names to calculate the 'Not Applicable' status: %s", missingNACheckNames))
123-
}
117+
if (length(checkNames) > 0 && !.containsNAchecks(checkNames)) {
118+
warning("Missing check names to calculate the 'Not Applicable' status.")
124119
}
125120

126-
127121
# temporary patch to work around vroom 1.6.4 bug
128122
readr::local_edition(1)
129123

tests/testthat/setup.R

+3
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,6 @@ if (Sys.getenv("DONT_DOWNLOAD_JDBC_DRIVERS", "") == "TRUE") {
1212
connectionDetailsEunomia <- Eunomia::getEunomiaConnectionDetails()
1313
cdmDatabaseSchemaEunomia <- "main"
1414
resultsDatabaseSchemaEunomia <- "main"
15+
16+
# Separate connection details for NA tests, as this requires removing records
17+
connectionDetailsEunomiaNaChecks <- Eunomia::getEunomiaConnectionDetails()

0 commit comments

Comments
 (0)