Skip to content

Commit c73b6c2

Browse files
committed
refactor: key_colnames and others
* key_colnames order change * replace kill_time_value with exclude arg in key_colnames * move duplicate time_values check in epi_slide
1 parent dd19428 commit c73b6c2

10 files changed

+78
-62
lines changed

R/autoplot.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ autoplot.epi_df <- function(
5555

5656
key_cols <- key_colnames(object)
5757
non_key_cols <- setdiff(names(object), key_cols)
58-
geo_and_other_keys <- kill_time_value(key_cols)
58+
geo_and_other_keys <- key_colnames(object, exclude = "time_value")
5959

6060
# --- check for numeric variables
6161
allowed <- purrr::map_lgl(object[non_key_cols], is.numeric)

R/epi_df.R

+4-7
Original file line numberDiff line numberDiff line change
@@ -184,18 +184,14 @@ new_epi_df <- function(x = tibble::tibble(geo_value = character(), time_value =
184184
metadata$other_keys <- other_keys
185185

186186
# Reorder columns (geo_value, time_value, ...)
187-
if (sum(dim(x)) != 0) {
188-
cols_to_put_first <- c("geo_value", "time_value", other_keys)
189-
x <- x[, c(
190-
cols_to_put_first,
191-
# All other columns
192-
names(x)[!(names(x) %in% cols_to_put_first)]
193-
)]
187+
if (nrow(x) > 0) {
188+
x <- x %>% relocate(all_of(c("geo_value", other_keys, "time_value")), .before = 1)
194189
}
195190

196191
# Apply epi_df class, attach metadata, and return
197192
class(x) <- c("epi_df", class(x))
198193
attributes(x)$metadata <- metadata
194+
199195
return(x)
200196
}
201197

@@ -281,6 +277,7 @@ as_epi_df.tbl_df <- function(
281277
if (".time_value_counts" %in% other_keys) {
282278
cli_abort("as_epi_df: `other_keys` can't include \".time_value_counts\"")
283279
}
280+
284281
duplicated_time_values <- x %>%
285282
group_by(across(all_of(c("geo_value", "time_value", other_keys)))) %>%
286283
filter(dplyr::n() > 1) %>%

R/grouped_epi_archive.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,8 @@ epix_slide.grouped_epi_archive <- function(
397397
)),
398398
capture.output(print(waldo::compare(
399399
res[[comp_nms[[comp_i]]]], comp_value[[comp_i]],
400-
x_arg = rlang::expr_deparse(expr(`$`(label, !!sym(comp_nms[[comp_i]])))),
401-
y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
400+
x_arg = rlang::expr_deparse(dplyr::expr(`$`(label, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
401+
y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
402402
))),
403403
cli::format_message(c(
404404
"You likely want to rename or remove this column in your output, or debug why it has a different value."

R/key_colnames.R

+10-11
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#'
33
#' @param x a data.frame, tibble, or epi_df
44
#' @param ... additional arguments passed on to methods
5+
#' @param exclude an optional character vector of keys to exclude
56
#'
67
#' @return If an `epi_df`, this returns all "keys". Otherwise `NULL`
78
#' @keywords internal
@@ -16,25 +17,23 @@ key_colnames.default <- function(x, ...) {
1617
}
1718

1819
#' @export
19-
key_colnames.data.frame <- function(x, other_keys = character(0L), ...) {
20+
key_colnames.data.frame <- function(x, other_keys = character(0L), exclude = character(0L), ...) {
2021
assert_character(other_keys)
21-
nm <- c("geo_value", "time_value", other_keys)
22+
assert_character(exclude)
23+
nm <- setdiff(c("geo_value", other_keys, "time_value"), exclude)
2224
intersect(nm, colnames(x))
2325
}
2426

2527
#' @export
26-
key_colnames.epi_df <- function(x, ...) {
28+
key_colnames.epi_df <- function(x, exclude = character(0L), ...) {
29+
assert_character(exclude)
2730
other_keys <- attr(x, "metadata")$other_keys
28-
c("geo_value", "time_value", other_keys)
31+
setdiff(c("geo_value", other_keys, "time_value"), exclude)
2932
}
3033

3134
#' @export
32-
key_colnames.epi_archive <- function(x, ...) {
35+
key_colnames.epi_archive <- function(x, exclude = character(0L), ...) {
36+
assert_character(exclude)
3337
other_keys <- attr(x, "metadata")$other_keys
34-
c("geo_value", "time_value", other_keys)
35-
}
36-
37-
kill_time_value <- function(v) {
38-
assert_character(v)
39-
v[v != "time_value"]
38+
setdiff(c("geo_value", other_keys, "time_value"), exclude)
4039
}

R/methods-epi_df.R

+13-10
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,13 @@ as_tibble.epi_df <- function(x, ...) {
4141
#' @export
4242
as_tsibble.epi_df <- function(x, key, ...) {
4343
if (missing(key)) key <- c("geo_value", attributes(x)$metadata$other_keys)
44-
return(as_tsibble(tibble::as_tibble(x),
45-
key = tidyselect::all_of(key), index = "time_value",
46-
...
47-
))
44+
return(
45+
as_tsibble(
46+
tibble::as_tibble(x),
47+
key = tidyselect::all_of(key), index = "time_value",
48+
...
49+
)
50+
)
4851
}
4952

5053
#' Base S3 methods for an `epi_df` object
@@ -150,10 +153,10 @@ dplyr_reconstruct.epi_df <- function(data, template) {
150153
# keep any grouping that has been applied:
151154
res <- NextMethod()
152155

153-
cn <- names(res)
156+
col_names <- names(res)
154157

155158
# Duplicate columns, cli_abort
156-
dup_col_names <- cn[duplicated(cn)]
159+
dup_col_names <- col_names[duplicated(col_names)]
157160
if (length(dup_col_names) != 0) {
158161
cli_abort(c(
159162
"Duplicate column names are not allowed",
@@ -163,7 +166,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {
163166
))
164167
}
165168

166-
not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
169+
not_epi_df <- !("time_value" %in% col_names) || !("geo_value" %in% col_names)
167170

168171
if (not_epi_df) {
169172
# If we're calling on an `epi_df` from one of our own functions, we need to
@@ -182,7 +185,7 @@ dplyr_reconstruct.epi_df <- function(data, template) {
182185

183186
# Amend additional metadata if some other_keys cols are dropped in the subset
184187
old_other_keys <- attr(template, "metadata")$other_keys
185-
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
188+
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% col_names]
186189

187190
res
188191
}
@@ -425,8 +428,8 @@ arrange_col_canonical.epi_df <- function(x, ...) {
425428
}
426429

427430
#' @export
428-
group_epi_df <- function(x) {
429-
cols <- kill_time_value(key_colnames(x))
431+
group_epi_df <- function(x, exclude = character()) {
432+
cols <- key_colnames(x, exclude = exclude)
430433
x %>% group_by(across(all_of(cols)))
431434
}
432435

R/slide.R

+29-12
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,7 @@ epi_slide <- function(
122122
assert_class(.x, "epi_df")
123123
if (checkmate::test_class(.x, "grouped_df")) {
124124
expected_group_keys <- .x %>%
125-
key_colnames() %>%
126-
kill_time_value() %>%
125+
key_colnames(exclude = "time_value") %>%
127126
sort()
128127
if (!identical(.x %>% group_vars() %>% sort(), expected_group_keys)) {
129128
cli_abort(
@@ -134,12 +133,11 @@ epi_slide <- function(
134133
)
135134
}
136135
} else {
137-
.x <- group_epi_df(.x)
136+
.x <- group_epi_df(.x, exclude = "time_value")
138137
}
139138
if (nrow(.x) == 0L) {
140139
return(.x)
141140
}
142-
143141
# If `.f` is missing, interpret ... as an expression for tidy evaluation
144142
if (missing(.f)) {
145143
used_data_masking <- TRUE
@@ -191,6 +189,20 @@ epi_slide <- function(
191189

192190
assert_logical(.all_rows, len = 1)
193191

192+
# Check for duplicated time values within groups
193+
duplicated_time_values <- .x %>%
194+
group_epi_df() %>%
195+
filter(dplyr::n() > 1) %>%
196+
ungroup()
197+
if (nrow(duplicated_time_values) > 0) {
198+
bad_data <- capture.output(duplicated_time_values)
199+
cli_abort(
200+
"as_epi_df: some groups in a resulting dplyr computation have duplicated time values.
201+
epi_df requires a unique time_value per group.",
202+
body = c("Sample groups:", bad_data)
203+
)
204+
}
205+
194206
# Begin handling completion. This will create a complete time index between
195207
# the smallest and largest time values in the data. This is used to ensure
196208
# that the slide function is called with a complete window of data. Each slide
@@ -241,7 +253,7 @@ epi_slide <- function(
241253
.keep = TRUE
242254
) %>%
243255
bind_rows() %>%
244-
filter(.data$.real) %>%
256+
filter(.real) %>%
245257
select(-.real) %>%
246258
arrange_col_canonical() %>%
247259
group_by(!!!.x_groups)
@@ -275,11 +287,16 @@ epi_slide_one_group <- function(
275287
missing_times <- all_dates[!(all_dates %in% .data_group$time_value)]
276288
.data_group <- bind_rows(
277289
.data_group,
278-
tibble(time_value = c(
279-
missing_times,
280-
.date_seq_list$pad_early_dates,
281-
.date_seq_list$pad_late_dates
282-
), .real = FALSE)
290+
dplyr::bind_cols(
291+
.group_key,
292+
tibble(
293+
time_value = c(
294+
missing_times,
295+
.date_seq_list$pad_early_dates,
296+
.date_seq_list$pad_late_dates
297+
), .real = FALSE
298+
)
299+
)
283300
) %>%
284301
arrange(.data$time_value)
285302

@@ -405,8 +422,8 @@ epi_slide_one_group <- function(
405422
)),
406423
capture.output(print(waldo::compare(
407424
res[[comp_nms[[comp_i]]]], slide_values[[comp_i]],
408-
x_arg = rlang::expr_deparse(expr(`$`(existing, !!sym(comp_nms[[comp_i]])))),
409-
y_arg = rlang::expr_deparse(expr(`$`(comp_value, !!sym(comp_nms[[comp_i]]))))
425+
x_arg = rlang::expr_deparse(dplyr::expr(`$`(existing, !!sym(comp_nms[[comp_i]])))), # nolint: object_usage_linter
426+
y_arg = rlang::expr_deparse(dplyr::expr(`$`(comp_value, !!sym(comp_nms[[comp_i]])))) # nolint: object_usage_linter
410427
))),
411428
cli::format_message(c(
412429
">" = "You likely want to rename or remove this column from your slide

man/key_colnames.Rd

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-arrange-canonical.R

+7-8
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,13 @@ test_that("canonical arrangement works", {
88
expect_error(arrange_canonical(tib))
99

1010
tib <- tib %>% as_epi_df(other_keys = "demo_grp")
11-
expect_equal(names(tib), c("geo_value", "time_value", "demo_grp", "x"))
11+
expect_equal(names(tib), c("geo_value", "demo_grp", "time_value", "x"))
1212

13-
tib_cols_shuffled <- tib %>% select(geo_value, time_value, x, demo_grp)
14-
15-
tib_sorted <- arrange_canonical(tib_cols_shuffled)
16-
expect_equal(names(tib_sorted), c("geo_value", "time_value", "demo_grp", "x"))
13+
tib_sorted <- tib %>%
14+
arrange_canonical()
15+
expect_equal(names(tib_sorted), c("geo_value", "demo_grp", "time_value", "x"))
1716
expect_equal(tib_sorted$geo_value, rep(c("ca", "ga"), each = 4))
18-
expect_equal(tib_sorted$time_value, c(1, 1, 2, 2, 1, 1, 2, 2))
19-
expect_equal(tib_sorted$demo_grp, rep(letters[1:2], times = 4))
20-
expect_equal(tib_sorted$x, c(8, 6, 7, 5, 4, 2, 3, 1))
17+
expect_equal(tib_sorted$time_value, c(1, 2, 1, 2, 1, 2, 1, 2))
18+
expect_equal(tib_sorted$demo_grp, c("a", "a", "b", "b", "a", "a", "b", "b"))
19+
expect_equal(tib_sorted$x, c(8, 7, 6, 5, 4, 3, 2, 1))
2120
})

tests/testthat/test-epi_slide.R

+5-5
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ get_test_dataset <- function(n, time_type = "day", other_keys = FALSE) {
5353
}
5454
df %>%
5555
arrange_canonical() %>%
56-
group_epi_df()
56+
group_epi_df(exclude = "time_value")
5757
}
5858
test_data <- get_test_dataset(num_rows_per_group, "day")
5959

@@ -82,10 +82,10 @@ epi_slide_sum_test <- function(
8282

8383
.x %>%
8484
mutate(.real = TRUE) %>%
85-
group_epi_df() %>%
85+
group_epi_df(exclude = "time_value") %>%
8686
complete(time_value = vctrs::vec_c(!!!date_seq_list, .name_spec = rlang::zap())) %>%
8787
arrange_canonical() %>%
88-
group_epi_df() %>%
88+
group_epi_df(exclude = "time_value") %>%
8989
mutate(
9090
slide_value = slider::slide_index_sum(
9191
.data$value,
@@ -246,7 +246,7 @@ for (p in (param_combinations %>% transpose())) {
246246
mutate(slide_value = list(slide_value)) %>%
247247
ungroup() %>%
248248
as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>%
249-
group_epi_df()
249+
group_epi_df(exclude = "time_value")
250250

251251
expect_equal(
252252
out %>% select(-slide_value),
@@ -268,7 +268,7 @@ for (p in (param_combinations %>% transpose())) {
268268
mutate(slide_value = list(slide_value)) %>%
269269
ungroup() %>%
270270
as_epi_df(as_of = attr(test_data, "metadata")$as_of, other_keys = attr(test_data, "metadata")$other_keys) %>%
271-
group_epi_df()
271+
group_epi_df(exclude = "time_value")
272272
expect_equal(
273273
out %>% select(-slide_value),
274274
expected_out %>% select(-slide_value)

tests/testthat/test-methods-epi_df.R

+5-6
Original file line numberDiff line numberDiff line change
@@ -69,21 +69,20 @@ test_that("Subsetting drops & does not drop the epi_df class appropriately", {
6969
expect_equal(ncol(col_subset2), 2L)
7070

7171
# Row and col subset that contains geo_value and time_value - should be epi_df
72-
row_col_subset2 <- toy_epi_df[2:3, 1:3]
72+
row_col_subset2 <- toy_epi_df[2:3, c(1, 4)]
7373
att_row_col_subset2 <- attr(row_col_subset2, "metadata")
7474

7575
expect_true(is_epi_df(row_col_subset2))
7676
expect_equal(nrow(row_col_subset2), 2L)
77-
expect_equal(ncol(row_col_subset2), 3L)
77+
expect_equal(ncol(row_col_subset2), 2L)
7878
expect_identical(att_row_col_subset2$geo_type, att_toy$geo_type)
7979
expect_identical(att_row_col_subset2$time_type, att_toy$time_type)
8080
expect_identical(att_row_col_subset2$as_of, att_toy$as_of)
81-
expect_identical(att_row_col_subset2$other_keys, att_toy$other_keys[1])
8281
})
8382

8483
test_that("When duplicate cols in subset should abort", {
8584
expect_error(toy_epi_df[, c(2, 2:3, 4, 4, 4)],
86-
"Duplicated column names: time_value, indic_var2",
85+
"Duplicated column names: indic_var1, time_value",
8786
fixed = TRUE
8887
)
8988
expect_error(toy_epi_df[1:4, c(1, 2:4, 1)],
@@ -94,7 +93,7 @@ test_that("When duplicate cols in subset should abort", {
9493

9594
test_that("Correct metadata when subset includes some of other_keys", {
9695
# Only include other_var of indic_var1
97-
only_indic_var1 <- toy_epi_df[, c(1:3, 5:6)]
96+
only_indic_var1 <- toy_epi_df[, c(1:2, 4:6)]
9897
att_only_indic_var1 <- attr(only_indic_var1, "metadata")
9998

10099
expect_true(is_epi_df(only_indic_var1))
@@ -106,7 +105,7 @@ test_that("Correct metadata when subset includes some of other_keys", {
106105
expect_identical(att_only_indic_var1$other_keys, att_toy$other_keys[-2])
107106

108107
# Only include other_var of indic_var2
109-
only_indic_var2 <- toy_epi_df[, c(1:2, 4:6)]
108+
only_indic_var2 <- toy_epi_df[, c(1, 3:6)]
110109
att_only_indic_var2 <- attr(only_indic_var2, "metadata")
111110

112111
expect_true(is_epi_df(only_indic_var2))

0 commit comments

Comments
 (0)