-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #32 from ccb-hms/variable-qc
Variable QC and per-table list of primary keys
- Loading branch information
Showing
7 changed files
with
427 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Package: phonto | ||
Type: Package | ||
Title: Using NHANES via Epiconductor | ||
Version: 0.1.1 | ||
Version: 0.1.2 | ||
Maintainer: Laha Ale <[email protected]>, Robert Gentleman <[email protected]> | ||
Authors@R: | ||
c( | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
|
||
|
||
## Manually curated list of variables that form primary keys for a | ||
## given table. For most tables, this is SEQN. | ||
|
||
## This list needs to be updated periodically | ||
|
||
|
||
##' Look up primary keys for a given NHANES table | ||
##' | ||
##' Most NHANES tables contain a variables called SEQN that represents | ||
##' a participant ID, and hence can be used as a primary key for | ||
##' join operations. Some tables are in the long format, potentially | ||
##' containing multiple observations per participant, where other | ||
##' variables along with SEQN serve as a composite primary key. Yet | ||
##' other tables have no participant ID, where the primary key | ||
##' variable depends on the specific context. For some tables | ||
##' (mostly those containing dietary components codes and drug / | ||
##' supplement codes) the intended primary key columns are not | ||
##' actually unique. | ||
##' @title primary_keys: Primary keys for NHANES table | ||
##' @param x Character string giving name of an NHANES table | ||
##' @param require_unique Logical; whether the likely intended primary | ||
##' key variables should be returned even if they do not uniquely | ||
##' identify rows. | ||
##' @return Character vector giving variables that should serve as | ||
##' primary keys for the table. May be \code{NULL} if | ||
##' \code{require_unique = TRUE}. | ||
##' @author Deepayan Sarkar | ||
primary_keys <- function(x, require_unique = FALSE) | ||
{ | ||
stopifnot(length(x) == 1) | ||
## For these tables, there is NO combination that can resonably | ||
## serve as primary key, even though we can find a combination | ||
## that almost works. These "intended" combinations are returned | ||
## by the switch statement below, but here we keep the option to | ||
## short-circuit that lookup and just return NULL, so that the DB | ||
## doesn't try to set any primary keys. | ||
exceptions <- | ||
c("RXQANA_C", "DS1IDS_G", "DS1IDS_J", "DSQ2_B", "DSQIDS_J", | ||
"P_RXQ_RX", "RXQ_RX_B", "DSBI", "DS1IDS_F", "DS1IDS_I", | ||
"P_DS1IDS", "DS2IDS_E", "DSQIDS_G", "DSQIDS_I", "PAQIAF", | ||
"PAQIAF_C", "PAQIAF_D", "RXQANA_B", "DS2IDS_F", "DS2IDS_H", | ||
"DS2IDS_I", "DS2IDS_J", "DSQ2_C", "P_DSQIDS", "PAQIAF_B", | ||
"RXQ_RX_C", "RXQ_RX_D", "RXQ_RX_E", "RXQ_RX_F", "RXQ_RX_G", | ||
"DS1IDS_E", "DS1IDS_H", "P_DS2IDS", "DSQIDS_E", "DSQIDS_H", | ||
"RXQ_RX", "RXQ_RX_H", "RXQ_RX_I", "RXQ_RX_J") | ||
if (require_unique && x %in% exceptions) return(NULL) | ||
switch(x, | ||
## tables which have duplicate SEQN | ||
## Audiometry | ||
P_AUXAR = , AUXAR_I = , AUXAR_J = c("SEQN", "RFXSEAR", "RFXLEVEL"), | ||
P_AUXTYM = , AUXTYM_I = , AUXTYM_J = c("SEQN", "TYXPEAR"), | ||
P_AUXWBR = , AUXWBR_I = , AUXWBR_J = c("SEQN", "WBXFEAR"), | ||
## Diet | ||
P_DR1IFF = , DR1IFF_C = , DR1IFF_D = , DR1IFF_E = , | ||
DR1IFF_F = , DR1IFF_G = , DR1IFF_H = , DR1IFF_I = , | ||
DR1IFF_J = c("SEQN", "DR1ILINE"), | ||
P_DR2IFF = , DR2IFF_C = , DR2IFF_D = , DR2IFF_E = , | ||
DR2IFF_F = , DR2IFF_G = , DR2IFF_H = , DR2IFF_I = , | ||
DR2IFF_J = c("SEQN", "DR2ILINE"), | ||
DRXIFF = , DRXIFF_B = c("SEQN", "DRXILINE"), | ||
## Dietary supplements | ||
DS1IDS_E = , DS1IDS_F = , DS1IDS_G = , DS1IDS_H = , DS1IDS_I = , | ||
DS2IDS_E = , DS2IDS_F = , DS2IDS_G = , DS2IDS_H = , DS2IDS_I = , | ||
DSQ2_B = , DSQ2_C = , DSQ2_D = , | ||
DSQFILE2 = c("SEQN", "DSDSUPID"), | ||
## Dietary supplements, but can be repeated (for multiple sources) | ||
DSQIDS_E = , DSQIDS_F = , DSQIDS_G = , DSQIDS_H = , | ||
DSQIDS_I = c("SEQN", "DSDSUPID"), | ||
## NCHS supplement id variable name changed in cycle J? | ||
P_DS1IDS = , P_DS2IDS = , P_DSQIDS = , DS1IDS_J = , DS2IDS_J = , | ||
DSQIDS_J = c("SEQN", "DSDPID"), | ||
## Miscellaneous | ||
FFQDC_C = , FFQDC_D = c("SEQN", "FFQ_VAR", "FFQ_FOOD"), | ||
PAQIAF = , PAQIAF_B = , PAQIAF_C = , PAQIAF_D = c("SEQN", "PADACTIV", "PADLEVEL"), | ||
PAXDAY_G = , PAXDAY_H = c("SEQN", "PAXSSNDP"), | ||
PAXHR_G = , PAXHR_H = c("SEQN", "PAXSSNHP"), | ||
P_RXQ_RX = , RXQ_RX = , RXQ_RX_B = , RXQ_RX_C = , RXQ_RX_D = , RXQ_RX_E = , | ||
RXQ_RX_F = , RXQ_RX_G = , RXQ_RX_H = , RXQ_RX_I = , RXQ_RX_J = c("SEQN", "RXDDRGID"), | ||
RXQ_ANA = c("SEQN", "RXQ310"), | ||
RXQANA_B = , RXQANA_C = c("SEQN", "RXD310"), | ||
SSHPV_F = c("SEQN", "SSHPTYPE"), | ||
|
||
## these 'pooled' tables have SAMPLEID | ||
BFRPOL_D = , BFRPOL_E = , BFRPOL_F = , BFRPOL_G = , | ||
BFRPOL_H = , BFRPOL_I = , DOXPOL_D = , DOXPOL_E = , | ||
DOXPOL_F = , DOXPOL_G = , PCBPOL_D = , PCBPOL_E = , | ||
PCBPOL_F = , PCBPOL_G = , PCBPOL_H = , PCBPOL_I = , | ||
PSTPOL_D = , PSTPOL_E = , PSTPOL_F = , PSTPOL_G = , | ||
PSTPOL_H = , PSTPOL_I = "SAMPLEID", | ||
|
||
## These have neither SEQN nor SAMPLEID | ||
|
||
P_DRXFCD = , DRXFCD_C = , DRXFCD_D = , DRXFCD_E = , DRXFCD_F = , | ||
DRXFCD_G = , DRXFCD_H = , DRXFCD_I = , DRXFCD_J = "DRXFDCD", | ||
DRXFMT = , DRXFMT_B = "START", | ||
DRXMCD_C = , DRXMCD_D = , DRXMCD_E = , DRXMCD_F = , DRXMCD_G = "DRXMC", | ||
DSBI = c("DSDIID", "DSDBID"), | ||
DSII = c("DSDPID", "DSDIID"), | ||
DSPI = "DSDPID", | ||
FOODLK_C = , FOODLK_D = "FFQ_FOOD", | ||
PFC_POOL = c("PFCANA", "PFCRACE", "PFCGENDR", "PFCAGE", "PFCPOOL"), | ||
RXQ_DRUG = "RXDDRGID", | ||
SSBFR_B = , SSPCB_B = , SSPST_B = "POOLID", | ||
VARLK_C = , VARLK_D = "FFQ_VAR", | ||
|
||
## default | ||
"SEQN") | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
|
||
|
||
|
||
## QC based on codebook metadata for one variable at a time. The main | ||
## goal is to detect and report inconsistencies in a variable within | ||
## or across cycles. By default, the variable is looked for in all | ||
## cycles, but specific cycles may also be specified | ||
## (QuestionnaireDescriptions has BeginYear and EndYear for each | ||
## table) | ||
|
||
## Metadata: fetch in advance and subset in R, because they are not | ||
## that big, or just get for specific variable? Unless we cache, | ||
## second option is probably better (do once inside qc_var). | ||
|
||
|
||
|
||
.where_clause <- function(variable = NULL, table = NULL) | ||
{ | ||
case <- 1L + length(variable) + 2 * length(table) | ||
switch(case, | ||
"", | ||
sprintf("where Variable = '%s'", variable), | ||
sprintf("where TableName = '%s'", table), | ||
sprintf("where Variable = '%s' and TableName = '%s'", variable, table)) | ||
} | ||
|
||
metadata_cb <- function(variable = NULL, table = NULL) | ||
{ | ||
nhanesQuery(paste("select * from Metadata.VariableCodebook", | ||
.where_clause(variable, table))) | ||
} | ||
metadata_var <- function(variable = NULL, table = NULL) | ||
{ | ||
nhanesQuery(paste("select * from Metadata.QuestionnaireVariables", | ||
.where_clause(variable, table))) | ||
} | ||
metadata_tab <- function(table = NULL) | ||
{ | ||
nhanesQuery(paste("select * from Metadata.QuestionnaireDescriptions", | ||
.where_clause(NULL, table))) | ||
} | ||
|
||
|
||
|
||
## The specific types of discrepancies we look for are: | ||
|
||
## - Whether appears in multiple tables in a given cycle | ||
|
||
## If yes, should be followed up by a check of whether values are consistent | ||
|
||
qc_var_multtable <- function(x, var, cb, tab) | ||
{ | ||
wtable <- subset(var, Variable == x)$TableName | ||
tsub <- subset(tab, TableName %in% wtable) | ||
cycle <- with(tsub, paste(BeginYear, EndYear, sep = "-")) | ||
if (anyDuplicated(cycle)) { | ||
o <- order(cycle, tsub$TableName) | ||
return(list(multiple_tables = data.frame(cycle = cycle[o], | ||
TableName = tsub$TableName[o]))) | ||
} | ||
return(NULL) | ||
} | ||
|
||
|
||
|
||
## - Inconsistency in Description / SasLabel (mostly benign) | ||
|
||
qc_var_description <- function(x, var, cb, tab, ignore.case = FALSE) | ||
{ | ||
description <- subset(var, Variable == x)[["Description"]] | ||
if (ignore.case) description <- tolower(description) | ||
tt <- table(description) | ||
if (length(tt) > 1) list(description_mismatch = table(description)) | ||
else NULL | ||
} | ||
|
||
|
||
qc_var_saslabel <- function(x, var, cb, tab, ignore.case = FALSE) | ||
{ | ||
saslabel <- subset(var, Variable == x)[["SasLabel"]] | ||
if (ignore.case) saslabel <- tolower(saslabel) | ||
tt <- table(saslabel) | ||
if (length(tt) > 1) list(saslabel_mismatch = table(saslabel)) | ||
else NULL | ||
} | ||
|
||
qc_var_target <- function(x, var, cb, tab, ignore.case = FALSE) | ||
{ | ||
target <- subset(var, Variable == x)[["Target"]] | ||
if (ignore.case) target <- tolower(target) | ||
tt <- table(target) | ||
if (length(tt) > 1) list(target_mismatch = table(target)) | ||
else NULL | ||
} | ||
|
||
|
||
|
||
## - Inconsistency in type (numeric / categorical) | ||
|
||
## - Inconsistency in levels for categorical variables (capitalization / other) | ||
|
||
## - Presence of 'special' values in numeric variables, and | ||
## inconsistency in them (including different codes for same | ||
## value). Should have option to exclude common examples like "Don't | ||
## know", "Refused", etc. | ||
|
||
## - Data coarsening (this may be tricky to identify) | ||
|
||
## - Whether variable may be skipped. This requires preparing an | ||
## initial table-level summary. | ||
|
||
## For variables appearing in multiple tables in the same cycle, an | ||
## additional check could be to see if it records the same data. This | ||
## should be a separate check, as it involves accessing the actual | ||
## data. | ||
|
||
|
||
|
||
|
||
|
||
##' QC report for a variable in NHANES | ||
##' | ||
##' @title qc_var: QC on NHANES variable | ||
##' @param x Character string naming a variable in one or more NHANES tables | ||
##' @param var Optional data frame containing variable metadata | ||
##' @param cb Optional data frame containing codebook metadata | ||
##' @param tab Optional data frame containing table metadata | ||
##' @return An object of S3 class \code{"qc_var"} with suitable print and summary methods. | ||
##' @export | ||
##' @author Deepayan Sarkar | ||
qc_var <- function(x, var = metadata_var(x), cb = metadata_cb(x), tab = metadata_tab()) | ||
{ | ||
res <- c(qc_var_multtable(x, var, cb, tab), | ||
qc_var_description(x, var, cb, tab), | ||
qc_var_saslabel(x, var, cb, tab), | ||
qc_var_target(x, var, cb, tab)) | ||
if (is.null(res)) res <- list() | ||
structure(res, | ||
variable = x, | ||
class = "qc_var") | ||
} | ||
|
||
#' @rdname qc_var | ||
#' @export | ||
#' @param object An object of class \code{"qv_var"} | ||
#' @param ... Additional arguments, ignored | ||
summary.qc_var <- function(object, ...) | ||
{ | ||
data.frame(Variable = attr(object, "variable"), | ||
multtable = !is.null(object$multiple_tables), | ||
description = !is.null(object$description_mismatch), | ||
saslabel = !is.null(object$saslabel_mismatch), | ||
target = !is.null(object$target_mismatch)) | ||
} | ||
|
||
|
||
|
||
#' @rdname qc_var | ||
#' @export | ||
print.qc_var <- function(x, ...) | ||
{ | ||
ok <- TRUE | ||
cat("Variable: ", attr(x, "variable")) | ||
if (!is.null(x$multiple_tables)) | ||
{ | ||
ok <- FALSE | ||
cat("\nAppears in multiple tables within same cycle:\n") | ||
## wcycle <- which(duplicated(x$multiple_tables$cycle)) | ||
## wsub <- subset(x$multiple_tables, cycle %in% cycle[wcycle]) | ||
tapply(x$multiple_tables, ~ cycle, function(d) paste(d$TableName, collapse = " / ")) |> | ||
array2DF(responseName = "Tables") |> print() | ||
} | ||
if (!is.null(x$description_mismatch)) | ||
{ | ||
ok <- FALSE | ||
cat("\nMismatch in Description:\n") | ||
print(array2DF(x$description_mismatch, responseName = "Frequency")) | ||
} | ||
if (!is.null(x$saslabel_mismatch)) | ||
{ | ||
ok <- FALSE | ||
cat("\nMismatch in Saslabel:\n") | ||
print(array2DF(x$saslabel_mismatch, responseName = "Frequency")) | ||
} | ||
if (!is.null(x$target_mismatch)) | ||
{ | ||
ok <- FALSE | ||
cat("\nMismatch in Target:\n") | ||
print(array2DF(x$target_mismatch, responseName = "Frequency")) | ||
} | ||
if (ok) cat(" --- no problems found") | ||
invisible(x) | ||
} | ||
|
||
|
||
|
||
|
||
if (FALSE) | ||
{ | ||
var <- metadata_var() | ||
cb <- metadata_cb() | ||
tab <- metadata_tab() | ||
|
||
qc_var("PHAFSTMN", var, cb, tab) | ||
qc_var("LBCBHC", var, cb, tab) | ||
qc_var("ENQ100", var, cb, tab) | ||
qc_var("LBXHCT", var, cb, tab) | ||
|
||
|
||
system.time({ | ||
var <- metadata_var() | ||
cb <- metadata_cb() | ||
tab <- metadata_tab() | ||
qc_var("LBCBHC", var, cb, tab) | ||
}) | ||
|
||
system.time(qc_var("LBCBHC")) | ||
|
||
|
||
qc_var("PHAFSTMN") | ||
qc_var("ENQ100") | ||
|
||
|
||
qc_var("LBCBHC") | ||
qc_var("LBXHCT") | ||
|
||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
## Identify questions in a table that might lead to skipping, and | ||
## which variables are potentially skipped as a result. This will need | ||
## to assume that the order is known, which we will take from the | ||
## codebook (make sure that's the order in the database as well). | ||
|
||
|
||
|
||
|
Oops, something went wrong.