Skip to content

Commit

Permalink
Merge pull request #32 from ccb-hms/variable-qc
Browse files Browse the repository at this point in the history
Variable QC and per-table list of primary keys
  • Loading branch information
deepayan authored Mar 9, 2024
2 parents 6e1e5c8 + fa07d4f commit 067e24e
Show file tree
Hide file tree
Showing 7 changed files with 427 additions and 1 deletion.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: phonto
Type: Package
Title: Using NHANES via Epiconductor
Version: 0.1.1
Version: 0.1.2
Maintainer: Laha Ale <[email protected]>, Robert Gentleman <[email protected]>
Authors@R:
c(
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(print,qc_var)
S3method(summary,qc_var)
export(checkDataConsistency)
export(dataDescription)
export(jointQuery)
Expand All @@ -14,6 +16,7 @@ export(nhanesQuery)
export(nhanesTail)
export(phesant)
export(plot_bins2)
export(qc_var)
export(unionQuery)
export(variableMetaData)
importFrom(stats,na.omit)
Expand Down
111 changes: 111 additions & 0 deletions R/primary_keys.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@


## Manually curated list of variables that form primary keys for a
## given table. For most tables, this is SEQN.

## This list needs to be updated periodically


##' Look up primary keys for a given NHANES table
##'
##' Most NHANES tables contain a variables called SEQN that represents
##' a participant ID, and hence can be used as a primary key for
##' join operations. Some tables are in the long format, potentially
##' containing multiple observations per participant, where other
##' variables along with SEQN serve as a composite primary key. Yet
##' other tables have no participant ID, where the primary key
##' variable depends on the specific context. For some tables
##' (mostly those containing dietary components codes and drug /
##' supplement codes) the intended primary key columns are not
##' actually unique.
##' @title primary_keys: Primary keys for NHANES table
##' @param x Character string giving name of an NHANES table
##' @param require_unique Logical; whether the likely intended primary
##' key variables should be returned even if they do not uniquely
##' identify rows.
##' @return Character vector giving variables that should serve as
##' primary keys for the table. May be \code{NULL} if
##' \code{require_unique = TRUE}.
##' @author Deepayan Sarkar
primary_keys <- function(x, require_unique = FALSE)
{
stopifnot(length(x) == 1)
## For these tables, there is NO combination that can resonably
## serve as primary key, even though we can find a combination
## that almost works. These "intended" combinations are returned
## by the switch statement below, but here we keep the option to
## short-circuit that lookup and just return NULL, so that the DB
## doesn't try to set any primary keys.
exceptions <-
c("RXQANA_C", "DS1IDS_G", "DS1IDS_J", "DSQ2_B", "DSQIDS_J",
"P_RXQ_RX", "RXQ_RX_B", "DSBI", "DS1IDS_F", "DS1IDS_I",
"P_DS1IDS", "DS2IDS_E", "DSQIDS_G", "DSQIDS_I", "PAQIAF",
"PAQIAF_C", "PAQIAF_D", "RXQANA_B", "DS2IDS_F", "DS2IDS_H",
"DS2IDS_I", "DS2IDS_J", "DSQ2_C", "P_DSQIDS", "PAQIAF_B",
"RXQ_RX_C", "RXQ_RX_D", "RXQ_RX_E", "RXQ_RX_F", "RXQ_RX_G",
"DS1IDS_E", "DS1IDS_H", "P_DS2IDS", "DSQIDS_E", "DSQIDS_H",
"RXQ_RX", "RXQ_RX_H", "RXQ_RX_I", "RXQ_RX_J")
if (require_unique && x %in% exceptions) return(NULL)
switch(x,
## tables which have duplicate SEQN
## Audiometry
P_AUXAR = , AUXAR_I = , AUXAR_J = c("SEQN", "RFXSEAR", "RFXLEVEL"),
P_AUXTYM = , AUXTYM_I = , AUXTYM_J = c("SEQN", "TYXPEAR"),
P_AUXWBR = , AUXWBR_I = , AUXWBR_J = c("SEQN", "WBXFEAR"),
## Diet
P_DR1IFF = , DR1IFF_C = , DR1IFF_D = , DR1IFF_E = ,
DR1IFF_F = , DR1IFF_G = , DR1IFF_H = , DR1IFF_I = ,
DR1IFF_J = c("SEQN", "DR1ILINE"),
P_DR2IFF = , DR2IFF_C = , DR2IFF_D = , DR2IFF_E = ,
DR2IFF_F = , DR2IFF_G = , DR2IFF_H = , DR2IFF_I = ,
DR2IFF_J = c("SEQN", "DR2ILINE"),
DRXIFF = , DRXIFF_B = c("SEQN", "DRXILINE"),
## Dietary supplements
DS1IDS_E = , DS1IDS_F = , DS1IDS_G = , DS1IDS_H = , DS1IDS_I = ,
DS2IDS_E = , DS2IDS_F = , DS2IDS_G = , DS2IDS_H = , DS2IDS_I = ,
DSQ2_B = , DSQ2_C = , DSQ2_D = ,
DSQFILE2 = c("SEQN", "DSDSUPID"),
## Dietary supplements, but can be repeated (for multiple sources)
DSQIDS_E = , DSQIDS_F = , DSQIDS_G = , DSQIDS_H = ,
DSQIDS_I = c("SEQN", "DSDSUPID"),
## NCHS supplement id variable name changed in cycle J?
P_DS1IDS = , P_DS2IDS = , P_DSQIDS = , DS1IDS_J = , DS2IDS_J = ,
DSQIDS_J = c("SEQN", "DSDPID"),
## Miscellaneous
FFQDC_C = , FFQDC_D = c("SEQN", "FFQ_VAR", "FFQ_FOOD"),
PAQIAF = , PAQIAF_B = , PAQIAF_C = , PAQIAF_D = c("SEQN", "PADACTIV", "PADLEVEL"),
PAXDAY_G = , PAXDAY_H = c("SEQN", "PAXSSNDP"),
PAXHR_G = , PAXHR_H = c("SEQN", "PAXSSNHP"),
P_RXQ_RX = , RXQ_RX = , RXQ_RX_B = , RXQ_RX_C = , RXQ_RX_D = , RXQ_RX_E = ,
RXQ_RX_F = , RXQ_RX_G = , RXQ_RX_H = , RXQ_RX_I = , RXQ_RX_J = c("SEQN", "RXDDRGID"),
RXQ_ANA = c("SEQN", "RXQ310"),
RXQANA_B = , RXQANA_C = c("SEQN", "RXD310"),
SSHPV_F = c("SEQN", "SSHPTYPE"),

## these 'pooled' tables have SAMPLEID
BFRPOL_D = , BFRPOL_E = , BFRPOL_F = , BFRPOL_G = ,
BFRPOL_H = , BFRPOL_I = , DOXPOL_D = , DOXPOL_E = ,
DOXPOL_F = , DOXPOL_G = , PCBPOL_D = , PCBPOL_E = ,
PCBPOL_F = , PCBPOL_G = , PCBPOL_H = , PCBPOL_I = ,
PSTPOL_D = , PSTPOL_E = , PSTPOL_F = , PSTPOL_G = ,
PSTPOL_H = , PSTPOL_I = "SAMPLEID",

## These have neither SEQN nor SAMPLEID

P_DRXFCD = , DRXFCD_C = , DRXFCD_D = , DRXFCD_E = , DRXFCD_F = ,
DRXFCD_G = , DRXFCD_H = , DRXFCD_I = , DRXFCD_J = "DRXFDCD",
DRXFMT = , DRXFMT_B = "START",
DRXMCD_C = , DRXMCD_D = , DRXMCD_E = , DRXMCD_F = , DRXMCD_G = "DRXMC",
DSBI = c("DSDIID", "DSDBID"),
DSII = c("DSDPID", "DSDIID"),
DSPI = "DSDPID",
FOODLK_C = , FOODLK_D = "FFQ_FOOD",
PFC_POOL = c("PFCANA", "PFCRACE", "PFCGENDR", "PFCAGE", "PFCPOOL"),
RXQ_DRUG = "RXDDRGID",
SSBFR_B = , SSPCB_B = , SSPST_B = "POOLID",
VARLK_C = , VARLK_D = "FFQ_VAR",

## default
"SEQN")
}

229 changes: 229 additions & 0 deletions R/qc-codebook.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@



## QC based on codebook metadata for one variable at a time. The main
## goal is to detect and report inconsistencies in a variable within
## or across cycles. By default, the variable is looked for in all
## cycles, but specific cycles may also be specified
## (QuestionnaireDescriptions has BeginYear and EndYear for each
## table)

## Metadata: fetch in advance and subset in R, because they are not
## that big, or just get for specific variable? Unless we cache,
## second option is probably better (do once inside qc_var).



.where_clause <- function(variable = NULL, table = NULL)
{
case <- 1L + length(variable) + 2 * length(table)
switch(case,
"",
sprintf("where Variable = '%s'", variable),
sprintf("where TableName = '%s'", table),
sprintf("where Variable = '%s' and TableName = '%s'", variable, table))
}

metadata_cb <- function(variable = NULL, table = NULL)
{
nhanesQuery(paste("select * from Metadata.VariableCodebook",
.where_clause(variable, table)))
}
metadata_var <- function(variable = NULL, table = NULL)
{
nhanesQuery(paste("select * from Metadata.QuestionnaireVariables",
.where_clause(variable, table)))
}
metadata_tab <- function(table = NULL)
{
nhanesQuery(paste("select * from Metadata.QuestionnaireDescriptions",
.where_clause(NULL, table)))
}



## The specific types of discrepancies we look for are:

## - Whether appears in multiple tables in a given cycle

## If yes, should be followed up by a check of whether values are consistent

qc_var_multtable <- function(x, var, cb, tab)
{
wtable <- subset(var, Variable == x)$TableName
tsub <- subset(tab, TableName %in% wtable)
cycle <- with(tsub, paste(BeginYear, EndYear, sep = "-"))
if (anyDuplicated(cycle)) {
o <- order(cycle, tsub$TableName)
return(list(multiple_tables = data.frame(cycle = cycle[o],
TableName = tsub$TableName[o])))
}
return(NULL)
}



## - Inconsistency in Description / SasLabel (mostly benign)

qc_var_description <- function(x, var, cb, tab, ignore.case = FALSE)
{
description <- subset(var, Variable == x)[["Description"]]
if (ignore.case) description <- tolower(description)
tt <- table(description)
if (length(tt) > 1) list(description_mismatch = table(description))
else NULL
}


qc_var_saslabel <- function(x, var, cb, tab, ignore.case = FALSE)
{
saslabel <- subset(var, Variable == x)[["SasLabel"]]
if (ignore.case) saslabel <- tolower(saslabel)
tt <- table(saslabel)
if (length(tt) > 1) list(saslabel_mismatch = table(saslabel))
else NULL
}

qc_var_target <- function(x, var, cb, tab, ignore.case = FALSE)
{
target <- subset(var, Variable == x)[["Target"]]
if (ignore.case) target <- tolower(target)
tt <- table(target)
if (length(tt) > 1) list(target_mismatch = table(target))
else NULL
}



## - Inconsistency in type (numeric / categorical)

## - Inconsistency in levels for categorical variables (capitalization / other)

## - Presence of 'special' values in numeric variables, and
## inconsistency in them (including different codes for same
## value). Should have option to exclude common examples like "Don't
## know", "Refused", etc.

## - Data coarsening (this may be tricky to identify)

## - Whether variable may be skipped. This requires preparing an
## initial table-level summary.

## For variables appearing in multiple tables in the same cycle, an
## additional check could be to see if it records the same data. This
## should be a separate check, as it involves accessing the actual
## data.





##' QC report for a variable in NHANES
##'
##' @title qc_var: QC on NHANES variable
##' @param x Character string naming a variable in one or more NHANES tables
##' @param var Optional data frame containing variable metadata
##' @param cb Optional data frame containing codebook metadata
##' @param tab Optional data frame containing table metadata
##' @return An object of S3 class \code{"qc_var"} with suitable print and summary methods.
##' @export
##' @author Deepayan Sarkar
qc_var <- function(x, var = metadata_var(x), cb = metadata_cb(x), tab = metadata_tab())
{
res <- c(qc_var_multtable(x, var, cb, tab),
qc_var_description(x, var, cb, tab),
qc_var_saslabel(x, var, cb, tab),
qc_var_target(x, var, cb, tab))
if (is.null(res)) res <- list()
structure(res,
variable = x,
class = "qc_var")
}

#' @rdname qc_var
#' @export
#' @param object An object of class \code{"qv_var"}
#' @param ... Additional arguments, ignored
summary.qc_var <- function(object, ...)
{
data.frame(Variable = attr(object, "variable"),
multtable = !is.null(object$multiple_tables),
description = !is.null(object$description_mismatch),
saslabel = !is.null(object$saslabel_mismatch),
target = !is.null(object$target_mismatch))
}



#' @rdname qc_var
#' @export
print.qc_var <- function(x, ...)
{
ok <- TRUE
cat("Variable: ", attr(x, "variable"))
if (!is.null(x$multiple_tables))
{
ok <- FALSE
cat("\nAppears in multiple tables within same cycle:\n")
## wcycle <- which(duplicated(x$multiple_tables$cycle))
## wsub <- subset(x$multiple_tables, cycle %in% cycle[wcycle])
tapply(x$multiple_tables, ~ cycle, function(d) paste(d$TableName, collapse = " / ")) |>
array2DF(responseName = "Tables") |> print()
}
if (!is.null(x$description_mismatch))
{
ok <- FALSE
cat("\nMismatch in Description:\n")
print(array2DF(x$description_mismatch, responseName = "Frequency"))
}
if (!is.null(x$saslabel_mismatch))
{
ok <- FALSE
cat("\nMismatch in Saslabel:\n")
print(array2DF(x$saslabel_mismatch, responseName = "Frequency"))
}
if (!is.null(x$target_mismatch))
{
ok <- FALSE
cat("\nMismatch in Target:\n")
print(array2DF(x$target_mismatch, responseName = "Frequency"))
}
if (ok) cat(" --- no problems found")
invisible(x)
}




if (FALSE)
{
var <- metadata_var()
cb <- metadata_cb()
tab <- metadata_tab()

qc_var("PHAFSTMN", var, cb, tab)
qc_var("LBCBHC", var, cb, tab)
qc_var("ENQ100", var, cb, tab)
qc_var("LBXHCT", var, cb, tab)


system.time({
var <- metadata_var()
cb <- metadata_cb()
tab <- metadata_tab()
qc_var("LBCBHC", var, cb, tab)
})

system.time(qc_var("LBCBHC"))


qc_var("PHAFSTMN")
qc_var("ENQ100")


qc_var("LBCBHC")
qc_var("LBXHCT")


}

9 changes: 9 additions & 0 deletions R/qc-skip.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

## Identify questions in a table that might lead to skipping, and
## which variables are potentially skipped as a result. This will need
## to assume that the order is known, which we will take from the
## codebook (make sure that's the order in the database as well).




Loading

0 comments on commit 067e24e

Please sign in to comment.