Merge pull request #196 from cmu-delphi/192-to-194-update_epi_df_indexing_fun

rachlobay · web-flow · commit c5cfb5e74564 · 2022-08-11T17:24:05.000-07:00
Addressed issues 192-194
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -37,8 +37,7 @@ Imports:
     tidyr,
     tidyselect,
     tsibble,
-    utils,
-    vctrs
+    utils
 Suggests:
     covidcast,
     epidatr,
@@ -47,6 +46,7 @@ Suggests:
     outbreaks,
     rmarkdown,
     testthat (>= 3.0.0),
+    vctrs,
     waldo (>= 0.3.1),
     withr
 VignetteBuilder: 
diff --git a/R/methods-epi_df.R b/R/methods-epi_df.R
@@ -71,22 +71,37 @@ summary.epi_df = function(object, ...) {
   
   if (missing(i)) {
     i <- NULL
-    i_arg <- NULL
   } 
   
   if (missing(j)) {
     j <- NULL
-    j_arg <- NULL
   } 
   
   cn <- names(res)
-  nr <- vctrs::vec_size(x) 
-  not_epi_df <- (!("time_value" %in% cn) || !("geo_value" %in% cn) || vctrs::vec_size(res) > nr || any(i > nr))
   
-  if (not_epi_df) return(tibble::as_tibble(res))
+  # Duplicate columns, Abort
+  dup_col_names = cn[duplicated(cn)]
+  if (length(dup_col_names) != 0) {
+    Abort(paste0("Column name(s) ", 
+                 paste(unique(dup_col_names), 
+                       collapse = ", "), " must not be duplicated."))
+  } 
+  
+  not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
+  
+  if (not_epi_df) {
+    attributes(res)$metadata <- NULL 
+    return(tibble::as_tibble(res))
+  }
+  
+  # Use reclass as safeguard (in case class &/or metadata are dropped)
+  res <- reclass(res, attr(x, "metadata"))
+  
+  # Amend additional metadata if some other_keys cols are dropped in the subset
+  old_other_keys = attr(x, "metadata")$other_keys
+  attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
   
-  # Use reclass as safeguard (in case class & metadata are dropped)
-  reclass(res, attr(x, "metadata"))
+  res
 }
 
 #' `dplyr` verbs
diff --git a/tests/testthat/test-methods-epi_df.R b/tests/testthat/test-methods-epi_df.R
@@ -7,12 +7,14 @@ toy_epi_df <- tibble::tibble(
     length.out = 5
   ), times = 2),
   geo_value = rep(c("ca", "hi"), each = 5),
-  indicator_var = as.factor(rep(1:2, times = 5)), 
-) %>% as_epi_df(additional_metadata = list(other_keys = "indicator_var"))
+  indic_var1 = as.factor(rep(1:2, times = 5)), 
+  indic_var2 = as.factor(rep(letters[1:5], times = 2))
+) %>% as_epi_df(additional_metadata = 
+                  list(other_keys = c("indic_var1", "indic_var2")))
 
 att_toy = attr(toy_epi_df, "metadata")
 
-test_that("head and tail do not drop the epi_df class", {
+test_that("Head and tail do not drop the epi_df class", {
   att_head = attr(head(toy_epi_df), "metadata")
   att_tail = attr(tail(toy_epi_df), "metadata")
   
@@ -29,35 +31,43 @@ test_that("head and tail do not drop the epi_df class", {
 })
 
 
-test_that("subsetting drops or does not drop the epi_df class appropriately", {
+test_that("Subsetting drops & does not drop the epi_df class appropriately", {
   
   # Row subset - should be epi_df
   row_subset = toy_epi_df[1:2, ]
   att_row_subset = attr(row_subset, "metadata")
   
   expect_true(is_epi_df(row_subset))
   expect_equal(nrow(row_subset), 2L)
-  expect_equal(ncol(row_subset), 5L)
+  expect_equal(ncol(row_subset), 6L)
   expect_identical(att_row_subset$geo_type, att_toy$geo_type)
   expect_identical(att_row_subset$time_type, att_toy$time_type)
   expect_identical(att_row_subset$as_of, att_toy$as_of)
   expect_identical(att_row_subset$other_keys, att_toy$other_keys)
   
-  # Col subset - shouldn't be an epi_df
-  col_subset = toy_epi_df[, 2:3]
-  
-  expect_false(is_epi_df(col_subset))
-  expect_true(tibble::is_tibble(col_subset))
-  expect_equal(nrow(col_subset), 10L)
-  expect_equal(ncol(col_subset), 2L)
-  
   # Row and col single value - shouldn't be an epi_df
   row_col_subset1 = toy_epi_df[1,2]
   expect_false(is_epi_df(row_col_subset1))
   expect_true(tibble::is_tibble(row_col_subset1))
   expect_equal(nrow(row_col_subset1), 1L)
   expect_equal(ncol(row_col_subset1), 1L)
   
+  # Col subset with no time_value - shouldn't be an epi_df
+  col_subset1 = toy_epi_df[, c(1,3)]
+  
+  expect_false(is_epi_df(col_subset1))
+  expect_true(tibble::is_tibble(col_subset1))
+  expect_equal(nrow(col_subset1), 10L)
+  expect_equal(ncol(col_subset1), 2L)
+  
+  # Col subset with no geo_value - shouldn't be an epi_df
+  col_subset2 = toy_epi_df[, 2:3]
+  
+  expect_false(is_epi_df(col_subset2))
+  expect_true(tibble::is_tibble(col_subset2))
+  expect_equal(nrow(col_subset2), 10L)
+  expect_equal(ncol(col_subset2), 2L)
+  
   # Row and col subset that contains geo_value and time_value - should be epi_df
   row_col_subset2 = toy_epi_df[2:3,1:3]
   att_row_col_subset2 = attr(row_col_subset2, "metadata")
@@ -68,6 +78,41 @@ test_that("subsetting drops or does not drop the epi_df class appropriately", {
   expect_identical(att_row_col_subset2$geo_type, att_toy$geo_type)
   expect_identical(att_row_col_subset2$time_type, att_toy$time_type)
   expect_identical(att_row_col_subset2$as_of, att_toy$as_of)
-  expect_identical(att_row_col_subset2$other_keys, att_toy$other_keys)
+  expect_identical(att_row_col_subset2$other_keys, character(0))
+})
+
+test_that("When duplicate cols in subset should abort", {
+  expect_error(toy_epi_df[, c(2,2:3,4,4,4)], 
+               "Column name(s) time_value, y must not be duplicated.", fixed = T)
+  expect_error(toy_epi_df[1:4, c(1,2:4,1)], 
+               "Column name(s) geo_value must not be duplicated.", fixed = T)
+})
+
+test_that("Correct metadata when subset includes some of other_keys", {
+  # Only include other_var of indic_var1
+  only_indic_var1 = toy_epi_df[, 1:5]
+  att_only_indic_var1 = attr(only_indic_var1, "metadata")
+  
+  expect_true(is_epi_df(only_indic_var1))
+  expect_equal(nrow(only_indic_var1), 10L)
+  expect_equal(ncol(only_indic_var1), 5L)
+  expect_identical(att_only_indic_var1$geo_type, att_toy$geo_type)
+  expect_identical(att_only_indic_var1$time_type, att_toy$time_type)
+  expect_identical(att_only_indic_var1$as_of, att_toy$as_of)
+  expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2])
+  
+  # Only include other_var of indic_var2
+  only_indic_var2 = toy_epi_df[, c(1:4,6)]
+  att_only_indic_var2 = attr(only_indic_var2, "metadata")
   
-})
+  expect_true(is_epi_df(only_indic_var2))
+  expect_equal(nrow(only_indic_var2), 10L)
+  expect_equal(ncol(only_indic_var2), 5L)
+  expect_identical(att_only_indic_var2$geo_type, att_toy$geo_type)
+  expect_identical(att_only_indic_var2$time_type, att_toy$time_type)
+  expect_identical(att_only_indic_var2$as_of, att_toy$as_of)
+  expect_identical(att_only_indic_var2$other_keys, att_toy$other_keys[-1])
+  
+  # Including both original other_keys was already tested above
+})
+