Skip to content

Commit cfebe5f

Browse files
committed
Address failing test, use dplyr_extending, don't ungroup in slides
* Fix failing grouped `epix_slide` test, where `group_modify` dropped the `epi_df` class when binding results together, by implementing `dplyr_reconstruct.epi_df`. * Implement the rest of `?dplyr_extending` and remove some now-unnecessary S3 methods for dplyr verbs, addressing #195, #223, and changing `epi[x]_slide` to leave grouping intact. * Update tests for grouped slides to reflect new behavior. * Update NEWS.md.
1 parent 6e3f554 commit cfebe5f

File tree

7 files changed

+152
-146
lines changed

7 files changed

+152
-146
lines changed

NAMESPACE

+4-7
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,22 @@
11
# Generated by roxygen2: do not edit by hand
22

33
S3method("[",epi_df)
4-
S3method(arrange,epi_df)
4+
S3method("names<-",epi_df)
55
S3method(as_epi_df,data.frame)
66
S3method(as_epi_df,epi_df)
77
S3method(as_epi_df,tbl_df)
88
S3method(as_epi_df,tbl_ts)
99
S3method(as_tsibble,epi_df)
1010
S3method(dplyr_col_modify,col_modify_recorder_df)
11-
S3method(filter,epi_df)
11+
S3method(dplyr_col_modify,epi_df)
12+
S3method(dplyr_reconstruct,epi_df)
1213
S3method(group_by,epi_archive)
1314
S3method(group_by,epi_df)
1415
S3method(group_by,grouped_epi_archive)
1516
S3method(group_by_drop_default,grouped_epi_archive)
16-
S3method(group_modify,epi_df)
17-
S3method(mutate,epi_df)
1817
S3method(next_after,Date)
1918
S3method(next_after,integer)
2019
S3method(print,epi_df)
21-
S3method(relocate,epi_df)
22-
S3method(rename,epi_df)
23-
S3method(slice,epi_df)
2420
S3method(summary,epi_df)
2521
S3method(ungroup,epi_df)
2622
S3method(unnest,epi_df)
@@ -66,6 +62,7 @@ importFrom(data.table,set)
6662
importFrom(data.table,setkeyv)
6763
importFrom(dplyr,arrange)
6864
importFrom(dplyr,dplyr_col_modify)
65+
importFrom(dplyr,dplyr_reconstruct)
6966
importFrom(dplyr,filter)
7067
importFrom(dplyr,group_by)
7168
importFrom(dplyr,group_by_drop_default)

NEWS.md

+19
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,28 @@ Note that `epiprocess` uses the [Semantic Versioning
44
("semver")](https://semver.org/) scheme for all release versions, but not for
55
development versions. A ".9999" suffix indicates a development version.
66

7+
## Breaking changes:
8+
9+
* `epi_slide` and `epix_slide` now keep the grouping of `x` in their results,
10+
like `dplyr::mutate` and `dplyr::group_modify`. To obtain the old behavior,
11+
`dplyr::ungroup` immediately after slides.
12+
13+
## Potentially-breaking changes:
14+
15+
* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when
16+
dropping the `epi_df` class (e.g., when removing the `time_value` column).
17+
* Fixed `group_modify` on `epi_df`s to decay to non-`epi_df`s when an `epi_df`
18+
result doesn't make sense (e.g., when removing the `time_value` column).
19+
20+
## Improvements:
21+
22+
* Fixed `epi_slide` and `group_modify` on grouped `epi_df`s to not drop `epi_df`
23+
class.
24+
725
## Cleanup:
826

927
* Added a `NEWS.md` file to track changes to the package.
28+
* Implemented `?dplyr::dplyr_extending` for `epi_df`s (#223).
1029

1130
# epiprocess 0.5.0:
1231

R/grouped_epi_archive.R

+57-28
Original file line numberDiff line numberDiff line change
@@ -215,24 +215,55 @@ grouped_epi_archive =
215215
# Symbolize column name
216216
new_col = sym(new_col_name)
217217

218-
# Key variable names, apart from time value and version
219-
key_vars = setdiff(key(private$ungrouped$DT), c("time_value", "version"))
218+
# Each computation is expected to output a data frame with either
219+
# one element/row total or one element/row per encountered
220+
# nongrouping, nontime, nonversion key value. These nongrouping,
221+
# nontime, nonversion key columns can be seen as the "effective" key
222+
# of the computation; the computation might return an object that
223+
# reports a different key or no key, but the "effective" key should
224+
# still be a valid unique key for the data, and is something that we
225+
# could use even with `.keep = FALSE`.
226+
comp_effective_key_vars =
227+
setdiff(key(private$ungrouped$DT),
228+
c(private$vars, "time_value", "version"))
220229

221230
# Computation for one group, one time value
222-
comp_one_grp = function(.data_group,
231+
comp_one_grp = function(.data_group, .group_key,
223232
f, ...,
224-
time_value,
225-
key_vars,
233+
ref_time_value,
234+
comp_effective_key_vars,
226235
new_col) {
227236
# Carry out the specified computation
228-
comp_value = f(.data_group, ...)
237+
comp_value = f(.data_group, .group_key, ...)
229238

230-
# Count the number of appearances of the reference time value.
231-
# Note: ideally, we want to directly count occurrences of the ref
232-
# time value but due to latency, this will often not appear in the
233-
# data group. So we count the number of unique key values, outside
234-
# of the time value column
235-
count = sum(!duplicated(.data_group[, key_vars]))
239+
# Calculate the number of output elements/rows we expect the
240+
# computation to output: one per distinct "effective computation
241+
# key variable" value encountered in the input. Note: this mirrors
242+
# how `epi_slide` does things if we're using unique keys, but can
243+
# diverge if using nonunique keys. The `epi_slide` approach of
244+
# counting occurrences of the `ref_time_value` in the `time_value`
245+
# column, which helps lines up the computation results with
246+
# corresponding rows of the input data, wouldn't quite apply here:
247+
# we'd want to line up with rows (from the same group) with
248+
# `version` matching the `ref_time_value`, but would still need to
249+
# summarize these rows somehow and drop the `time_value` input
250+
# column, but this summarization requires something like a
251+
# to-be-unique output key to determine a sensible number of rows
252+
# to output (and the contents of those rows).
253+
count =
254+
if (length(comp_effective_key_vars) != 0L) {
255+
sum(!duplicated(.data_group[, comp_effective_key_vars]))
256+
} else {
257+
# Same idea as above, but accounting for `duplicated` not
258+
# working as we want on 0 columns. (Should be the same as if
259+
# we were counting distinct values of a column defined as
260+
# `rep(val, target_n_rows)`.)
261+
if (nrow(.data_group) == 0L) {
262+
0L
263+
} else {
264+
1L
265+
}
266+
}
236267

237268
# If we get back an atomic vector
238269
if (is.atomic(comp_value)) {
@@ -241,7 +272,7 @@ grouped_epi_archive =
241272
}
242273
# If not a singleton, should be the right length, else abort
243274
else if (length(comp_value) != count) {
244-
Abort("If the slide computation returns an atomic vector, then it must have a single element, or else one element per appearance of the reference time value in the local window.")
275+
Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.')
245276
}
246277
}
247278

@@ -256,7 +287,7 @@ grouped_epi_archive =
256287
}
257288
# Make into a list
258289
else {
259-
comp_value = split(comp_value, 1:nrow(comp_value))
290+
comp_value = split(comp_value, seq_len(nrow(comp_value)))
260291
}
261292
}
262293

@@ -265,10 +296,9 @@ grouped_epi_archive =
265296
Abort("The slide computation must return an atomic vector or a data frame.")
266297
}
267298

268-
# Note that we've already recycled comp value to make size stable,
269-
# so tibble() will just recycle time value appropriately
270-
return(tibble::tibble(time_value = time_value,
271-
!!new_col := comp_value))
299+
# Label every result row with the `ref_time_value`:
300+
return(tibble::tibble(time_value = rep(.env$ref_time_value, count),
301+
!!new_col := .env$comp_value))
272302
}
273303

274304
# If f is not missing, then just go ahead, slide by group
@@ -278,12 +308,12 @@ grouped_epi_archive =
278308
private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before) %>%
279309
dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars)),
280310
.drop=private$drop) %>%
281-
dplyr::summarize(comp_one_grp(dplyr::cur_data_all(),
282-
f = f, ...,
283-
time_value = ref_time_value,
284-
key_vars = key_vars,
285-
new_col = new_col),
286-
.groups = groups)
311+
dplyr::group_modify(comp_one_grp,
312+
f = f, ...,
313+
ref_time_value = ref_time_value,
314+
comp_effective_key_vars = comp_effective_key_vars,
315+
new_col = new_col,
316+
.keep = TRUE)
287317
})
288318
}
289319

@@ -306,11 +336,10 @@ grouped_epi_archive =
306336
dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars))) %>%
307337
dplyr::group_modify(comp_one_grp,
308338
f = f, quo = quo,
309-
time_value = ref_time_value,
310-
key_vars = key_vars,
339+
ref_time_value = ref_time_value,
340+
comp_effective_key_vars = comp_effective_key_vars,
311341
new_col = new_col,
312-
.keep = TRUE) %>%
313-
dplyr::ungroup()
342+
.keep = TRUE)
314343
})
315344
}
316345

R/methods-epi_df.R

+66-82
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@ as_tsibble.epi_df = function(x, key, ...) {
1919

2020
#' Base S3 methods for an `epi_df` object
2121
#'
22-
#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an
23-
#' `epi_df` object.
22+
#' Print and summary functions for an `epi_df` object.
2423
#'
2524
#' @param x The `epi_df` object.
2625
#' @param ... Additional arguments passed to methods.
@@ -65,119 +64,104 @@ summary.epi_df = function(object, ...) {
6564
dplyr::summarize(mean(.data$num)))))
6665
}
6766

67+
#' Drop any `epi_df` metadata and class on a data frame
68+
#'
69+
#' Useful in implementing `?dplyr_extending` when manipulations cause invariants
70+
#' of `epi_df`s to be violated and we need to return some other class. Note that
71+
#' this will maintain any grouping (keeping the `grouped_df` class and
72+
#' associated attributes, if present).
73+
#'
74+
#' @param x an `epi_df` or other data frame
75+
#' @return `x` with any metadata dropped and the `"epi_df"` class, if previously
76+
#' present, dropped
77+
#'
78+
#' @noRd
79+
decay_epi_df = function(x) {
80+
attributes(x)$metadata <- NULL
81+
class(x) <- class(x)[class(x) != "epi_df"]
82+
x
83+
}
84+
85+
# Implementing `dplyr_extending`: `geo_type` and `time_type` are scalar
86+
# attributes dependent on columns, and `other_keys` acts like an attribute
87+
# vectorized over columns; `dplyr_extending` advice says to implement
88+
# `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but not
89+
# `dplyr_row_slice`. We'll implement `[` to allow either 1d or 2d.
90+
91+
#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may
92+
#' directly feed in latter from our other methods)
93+
#' @param template `epi_df` template to use to restore
94+
#' @return `epi_df` or degrade into `tbl_df`
95+
#' @importFrom dplyr dplyr_reconstruct
6896
#' @export
69-
`[.epi_df` <- function(x, i, j, drop = FALSE) {
70-
res <- NextMethod()
71-
72-
if (!is.data.frame(res)) return(res)
73-
74-
if (missing(i)) {
75-
i <- NULL
76-
}
77-
78-
if (missing(j)) {
79-
j <- NULL
80-
}
97+
#' @noRd
98+
dplyr_reconstruct.epi_df = function(data, template) {
99+
# Start from a reconstruction for the backing S3 classes; this ensures that we
100+
# keep any grouping that has been applied:
101+
res <- NextMethod()
81102

82103
cn <- names(res)
83-
104+
84105
# Duplicate columns, Abort
85106
dup_col_names = cn[duplicated(cn)]
86107
if (length(dup_col_names) != 0) {
87108
Abort(paste0("Column name(s) ",
88109
paste(unique(dup_col_names),
89110
collapse = ", "), " must not be duplicated."))
90-
}
111+
}
91112

92113
not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
93114

94115
if (not_epi_df) {
95-
attributes(res)$metadata <- NULL
96-
return(tibble::as_tibble(res))
116+
# If we're calling on an `epi_df` from one of our own functions, we need to
117+
# decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble,
118+
# `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we
119+
# simply need to skip adding the metadata & class. Current `decay_epi_df`
120+
# should work in both cases.
121+
return(decay_epi_df(res))
97122
}
98123

99-
# Use reclass as safeguard (in case class &/or metadata are dropped)
100-
res <- reclass(res, attr(x, "metadata"))
124+
res <- reclass(res, attr(template, "metadata"))
125+
126+
# XXX we may want verify the `geo_type` and `time_type` here. If it's
127+
# significant overhead, we may also want to keep this less strict version
128+
# around and implement some extra S3 methods that use it, when appropriate.
101129

102130
# Amend additional metadata if some other_keys cols are dropped in the subset
103-
old_other_keys = attr(x, "metadata")$other_keys
131+
old_other_keys = attr(template, "metadata")$other_keys
104132
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
105133

106134
res
107135
}
108136

109-
#' `dplyr` verbs
110-
#'
111-
#' `dplyr` verbs for `epi_df` objects, preserving class and attributes.
112-
#'
113-
#' @method arrange epi_df
114-
#' @param .data The `epi_df` object.
115-
#' @rdname print.epi_df
116-
#' @export
117-
arrange.epi_df = function(.data, ...) {
118-
metadata = attributes(.data)$metadata
119-
.data = NextMethod()
120-
reclass(.data, metadata)
121-
}
122-
123-
#' @method filter epi_df
124-
#' @rdname print.epi_df
125-
#' @export
126-
filter.epi_df = function(.data, ...) {
127-
metadata = attributes(.data)$metadata
128-
.data = NextMethod()
129-
reclass(.data, metadata)
130-
}
131-
132-
#' @method group_by epi_df
133-
#' @rdname print.epi_df
134137
#' @export
135-
group_by.epi_df = function(.data, ...) {
136-
metadata = attributes(.data)$metadata
137-
.data = NextMethod()
138-
reclass(.data, metadata)
139-
}
140-
141-
#' @method group_modify epi_df
142-
#' @rdname print.epi_df
143-
#' @export
144-
group_modify.epi_df = function(.data, ...) {
145-
metadata = attributes(.data)$metadata
146-
.data = NextMethod()
147-
reclass(.data, metadata)
148-
}
149-
150-
#' @method mutate epi_df
151-
#' @rdname print.epi_df
152-
#' @export
153-
mutate.epi_df = function(.data, ...) {
154-
metadata = attributes(.data)$metadata
155-
.data = NextMethod()
156-
reclass(.data, metadata)
138+
`[.epi_df` <- function(x, i, j, drop = FALSE) {
139+
res <- NextMethod()
140+
141+
if (!is.data.frame(res)) return(res)
142+
143+
dplyr::dplyr_reconstruct(res, x)
157144
}
158145

159-
#' @method relocate epi_df
160-
#' @rdname print.epi_df
146+
#' @importFrom dplyr dplyr_col_modify
161147
#' @export
162-
relocate.epi_df = function(.data, ...) {
163-
metadata = attributes(.data)$metadata
164-
.data = NextMethod()
165-
reclass(.data, metadata)
148+
dplyr_col_modify.epi_df = function(data, cols) {
149+
dplyr::dplyr_reconstruct(NextMethod(), data)
166150
}
167151

168-
#' @method rename epi_df
169-
#' @rdname print.epi_df
170152
#' @export
171-
rename.epi_df = function(.data, ...) {
172-
metadata = attributes(.data)$metadata
173-
.data = NextMethod()
174-
reclass(.data, metadata)
153+
`names<-.epi_df` = function(x, value) {
154+
old_names = names(x)
155+
old_other_keys = attributes(x)$metadata$other_keys
156+
result = NextMethod()
157+
attributes(x)$metadata$other_keys <- value[match(old_other_keys, old_names)]
158+
dplyr::dplyr_reconstruct(result, result)
175159
}
176160

177-
#' @method slice epi_df
161+
#' @method group_by epi_df
178162
#' @rdname print.epi_df
179163
#' @export
180-
slice.epi_df = function(.data, ...) {
164+
group_by.epi_df = function(.data, ...) {
181165
metadata = attributes(.data)$metadata
182166
.data = NextMethod()
183167
reclass(.data, metadata)

0 commit comments

Comments
 (0)