Skip to content

Commit 4ca96b0

Browse files
committed
Half-fix failing test, use dplyr_extending, don't ungroup in slides
* Half-fix failing grouped `epix_slide` test, where `group_modify` dropped the `epi_df` class when binding results together, by implementing `dplyr_reconstruct.epi_df`. Somehow a session documenting and testing immediately before this commit and then after it will have the tests pass, but the tests won't pass off of this branch directly. The class vector involved in the failure does appear to change. * Explicate `epix_slide` `count` derivation. * Implement the rest of `?dplyr_extending` and remove some now-unnecessary S3 methods for dplyr verbs, addressing #195, #223, and changing `epi[x]_slide` to leave grouping intact. * Update tests for grouped slides to reflect new behavior. * Update NEWS.md.
1 parent 6e3f554 commit 4ca96b0

File tree

7 files changed

+152
-146
lines changed

7 files changed

+152
-146
lines changed

NAMESPACE

+4-7
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,22 @@
11
# Generated by roxygen2: do not edit by hand
22

33
S3method("[",epi_df)
4-
S3method(arrange,epi_df)
4+
S3method("names<-",epi_df)
55
S3method(as_epi_df,data.frame)
66
S3method(as_epi_df,epi_df)
77
S3method(as_epi_df,tbl_df)
88
S3method(as_epi_df,tbl_ts)
99
S3method(as_tsibble,epi_df)
1010
S3method(dplyr_col_modify,col_modify_recorder_df)
11-
S3method(filter,epi_df)
11+
S3method(dplyr_col_modify,epi_df)
12+
S3method(dplyr_reconstruct,epi_df)
1213
S3method(group_by,epi_archive)
1314
S3method(group_by,epi_df)
1415
S3method(group_by,grouped_epi_archive)
1516
S3method(group_by_drop_default,grouped_epi_archive)
16-
S3method(group_modify,epi_df)
17-
S3method(mutate,epi_df)
1817
S3method(next_after,Date)
1918
S3method(next_after,integer)
2019
S3method(print,epi_df)
21-
S3method(relocate,epi_df)
22-
S3method(rename,epi_df)
23-
S3method(slice,epi_df)
2420
S3method(summary,epi_df)
2521
S3method(ungroup,epi_df)
2622
S3method(unnest,epi_df)
@@ -66,6 +62,7 @@ importFrom(data.table,set)
6662
importFrom(data.table,setkeyv)
6763
importFrom(dplyr,arrange)
6864
importFrom(dplyr,dplyr_col_modify)
65+
importFrom(dplyr,dplyr_reconstruct)
6966
importFrom(dplyr,filter)
7067
importFrom(dplyr,group_by)
7168
importFrom(dplyr,group_by_drop_default)

NEWS.md

+19
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,28 @@ Note that `epiprocess` uses the [Semantic Versioning
44
("semver")](https://semver.org/) scheme for all release versions, but not for
55
development versions. A ".9999" suffix indicates a development version.
66

7+
## Breaking changes:
8+
9+
* `epi_slide` and `epix_slide` now keep the grouping of `x` in their results,
10+
like `dplyr::mutate` and `dplyr::group_modify`. To obtain the old behavior,
11+
`dplyr::ungroup` immediately after slides.
12+
13+
## Potentially-breaking changes:
14+
15+
* Fixed `[` on grouped `epi_df`s to maintain the grouping if possible when
16+
dropping the `epi_df` class (e.g., when removing the `time_value` column).
17+
* Fixed `group_modify` on `epi_df`s to decay to non-`epi_df`s when an `epi_df`
18+
result doesn't make sense (e.g., when removing the `time_value` column).
19+
20+
## Improvements:
21+
22+
* Fixed `epi_slide` and `group_modify` on grouped `epi_df`s to not drop `epi_df`
23+
class.
24+
725
## Cleanup:
826

927
* Added a `NEWS.md` file to track changes to the package.
28+
* Implemented `?dplyr::dplyr_extending` for `epi_df`s (#223).
1029

1130
# epiprocess 0.5.0:
1231

R/grouped_epi_archive.R

+57-28
Original file line numberDiff line numberDiff line change
@@ -215,24 +215,55 @@ grouped_epi_archive =
215215
# Symbolize column name
216216
new_col = sym(new_col_name)
217217

218-
# Key variable names, apart from time value and version
219-
key_vars = setdiff(key(private$ungrouped$DT), c("time_value", "version"))
218+
# Each computation is expected to output a data frame with either
219+
# one element/row total or one element/row per encountered
220+
# nongrouping, nontime, nonversion key value. These nongrouping,
221+
# nontime, nonversion key columns can be seen as the "effective" key
222+
# of the computation; the computation might return an object that
223+
# reports a different key or no key, but the "effective" key should
224+
# still be a valid unique key for the data, and is something that we
225+
# could use even with `.keep = FALSE`.
226+
comp_effective_key_vars =
227+
setdiff(key(private$ungrouped$DT),
228+
c(private$vars, "time_value", "version"))
220229

221230
# Computation for one group, one time value
222-
comp_one_grp = function(.data_group,
231+
comp_one_grp = function(.data_group, .group_key,
223232
f, ...,
224-
time_value,
225-
key_vars,
233+
ref_time_value,
234+
comp_effective_key_vars,
226235
new_col) {
227236
# Carry out the specified computation
228-
comp_value = f(.data_group, ...)
237+
comp_value = f(.data_group, .group_key, ...)
229238

230-
# Count the number of appearances of the reference time value.
231-
# Note: ideally, we want to directly count occurrences of the ref
232-
# time value but due to latency, this will often not appear in the
233-
# data group. So we count the number of unique key values, outside
234-
# of the time value column
235-
count = sum(!duplicated(.data_group[, key_vars]))
239+
# Calculate the number of output elements/rows we expect the
240+
# computation to output: one per distinct "effective computation
241+
# key variable" value encountered in the input. Note: this mirrors
242+
# how `epi_slide` does things if we're using unique keys, but can
243+
# diverge if using nonunique keys. The `epi_slide` approach of
244+
# counting occurrences of the `ref_time_value` in the `time_value`
245+
# column, which helps lines up the computation results with
246+
# corresponding rows of the input data, wouldn't quite apply here:
247+
# we'd want to line up with rows (from the same group) with
248+
# `version` matching the `ref_time_value`, but would still need to
249+
# summarize these rows somehow and drop the `time_value` input
250+
# column, but this summarization requires something like a
251+
# to-be-unique output key to determine a sensible number of rows
252+
# to output (and the contents of those rows).
253+
count =
254+
if (length(comp_effective_key_vars) != 0L) {
255+
sum(!duplicated(.data_group[, comp_effective_key_vars]))
256+
} else {
257+
# Same idea as above, but accounting for `duplicated` not
258+
# working as we want on 0 columns. (Should be the same as if
259+
# we were counting distinct values of a column defined as
260+
# `rep(val, target_n_rows)`.)
261+
if (nrow(.data_group) == 0L) {
262+
0L
263+
} else {
264+
1L
265+
}
266+
}
236267

237268
# If we get back an atomic vector
238269
if (is.atomic(comp_value)) {
@@ -241,7 +272,7 @@ grouped_epi_archive =
241272
}
242273
# If not a singleton, should be the right length, else abort
243274
else if (length(comp_value) != count) {
244-
Abort("If the slide computation returns an atomic vector, then it must have a single element, or else one element per appearance of the reference time value in the local window.")
275+
Abort('If the slide computation returns an atomic vector, then it must have either (a) a single element, or (b) one element per distinct combination of key variables, excluding the `time_value`, `version`, and grouping variables, that is present in the first argument to the computation.')
245276
}
246277
}
247278

@@ -256,7 +287,7 @@ grouped_epi_archive =
256287
}
257288
# Make into a list
258289
else {
259-
comp_value = split(comp_value, 1:nrow(comp_value))
290+
comp_value = split(comp_value, seq_len(nrow(comp_value)))
260291
}
261292
}
262293

@@ -265,10 +296,9 @@ grouped_epi_archive =
265296
Abort("The slide computation must return an atomic vector or a data frame.")
266297
}
267298

268-
# Note that we've already recycled comp value to make size stable,
269-
# so tibble() will just recycle time value appropriately
270-
return(tibble::tibble(time_value = time_value,
271-
!!new_col := comp_value))
299+
# Label every result row with the `ref_time_value`:
300+
return(tibble::tibble(time_value = rep(.env$ref_time_value, count),
301+
!!new_col := .env$comp_value))
272302
}
273303

274304
# If f is not missing, then just go ahead, slide by group
@@ -278,12 +308,12 @@ grouped_epi_archive =
278308
private$ungrouped$as_of(ref_time_value, min_time_value = ref_time_value - before) %>%
279309
dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars)),
280310
.drop=private$drop) %>%
281-
dplyr::summarize(comp_one_grp(dplyr::cur_data_all(),
282-
f = f, ...,
283-
time_value = ref_time_value,
284-
key_vars = key_vars,
285-
new_col = new_col),
286-
.groups = groups)
311+
dplyr::group_modify(comp_one_grp,
312+
f = f, ...,
313+
ref_time_value = ref_time_value,
314+
comp_effective_key_vars = comp_effective_key_vars,
315+
new_col = new_col,
316+
.keep = TRUE)
287317
})
288318
}
289319

@@ -306,11 +336,10 @@ grouped_epi_archive =
306336
dplyr::group_by(dplyr::across(tidyselect::all_of(private$vars))) %>%
307337
dplyr::group_modify(comp_one_grp,
308338
f = f, quo = quo,
309-
time_value = ref_time_value,
310-
key_vars = key_vars,
339+
ref_time_value = ref_time_value,
340+
comp_effective_key_vars = comp_effective_key_vars,
311341
new_col = new_col,
312-
.keep = TRUE) %>%
313-
dplyr::ungroup()
342+
.keep = TRUE)
314343
})
315344
}
316345

R/methods-epi_df.R

+66-82
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,7 @@ as_tsibble.epi_df = function(x, key, ...) {
1919

2020
#' Base S3 methods for an `epi_df` object
2121
#'
22-
#' Print, summary, and `dplyr` verbs (that preserve class and attributes) for an
23-
#' `epi_df` object.
22+
#' Print and summary functions for an `epi_df` object.
2423
#'
2524
#' @param x The `epi_df` object.
2625
#' @param ... Additional arguments passed to methods.
@@ -65,119 +64,104 @@ summary.epi_df = function(object, ...) {
6564
dplyr::summarize(mean(.data$num)))))
6665
}
6766

67+
#' Drop any `epi_df` metadata and class on a data frame
68+
#'
69+
#' Useful in implementing `?dplyr_extending` when manipulations cause invariants
70+
#' of `epi_df`s to be violated and we need to return some other class. Note that
71+
#' this will maintain any grouping (keeping the `grouped_df` class and
72+
#' associated attributes, if present).
73+
#'
74+
#' @param x an `epi_df` or other data frame
75+
#' @return `x` with any metadata dropped and the `"epi_df"` class, if previously
76+
#' present, dropped
77+
#'
78+
#' @noRd
79+
decay_epi_df = function(x) {
80+
attributes(x)$metadata <- NULL
81+
class(x) <- class(x)[class(x) != "epi_df"]
82+
x
83+
}
84+
85+
# Implementing `dplyr_extending`: `geo_type` and `time_type` are scalar
86+
# attributes dependent on columns, and `other_keys` acts like an attribute
87+
# vectorized over columns; `dplyr_extending` advice says to implement
88+
# `dplyr_reconstruct`, 1d `[`, `dplyr_col_modify`, and `names<-`, but not
89+
# `dplyr_row_slice`. We'll implement `[` to allow either 1d or 2d.
90+
91+
#' @param data tibble or `epi_df` (`dplyr` feeds in former, but we may
92+
#' directly feed in latter from our other methods)
93+
#' @param template `epi_df` template to use to restore
94+
#' @return `epi_df` or degrade into `tbl_df`
95+
#' @importFrom dplyr dplyr_reconstruct
6896
#' @export
69-
`[.epi_df` <- function(x, i, j, drop = FALSE) {
70-
res <- NextMethod()
71-
72-
if (!is.data.frame(res)) return(res)
73-
74-
if (missing(i)) {
75-
i <- NULL
76-
}
77-
78-
if (missing(j)) {
79-
j <- NULL
80-
}
97+
#' @noRd
98+
dplyr_reconstruct.epi_df = function(data, template) {
99+
# Start from a reconstruction for the backing S3 classes; this ensures that we
100+
# keep any grouping that has been applied:
101+
res <- NextMethod()
81102

82103
cn <- names(res)
83-
104+
84105
# Duplicate columns, Abort
85106
dup_col_names = cn[duplicated(cn)]
86107
if (length(dup_col_names) != 0) {
87108
Abort(paste0("Column name(s) ",
88109
paste(unique(dup_col_names),
89110
collapse = ", "), " must not be duplicated."))
90-
}
111+
}
91112

92113
not_epi_df <- !("time_value" %in% cn) || !("geo_value" %in% cn)
93114

94115
if (not_epi_df) {
95-
attributes(res)$metadata <- NULL
96-
return(tibble::as_tibble(res))
116+
# If we're calling on an `epi_df` from one of our own functions, we need to
117+
# decay to a non-`epi_df` result. If `dplyr` is calling, `x` is a tibble,
118+
# `res` is not an `epi_df` yet (but might, e.g., be a `grouped_df`), and we
119+
# simply need to skip adding the metadata & class. Current `decay_epi_df`
120+
# should work in both cases.
121+
return(decay_epi_df(res))
97122
}
98123

99-
# Use reclass as safeguard (in case class &/or metadata are dropped)
100-
res <- reclass(res, attr(x, "metadata"))
124+
res <- reclass(res, attr(template, "metadata"))
125+
126+
# XXX we may want verify the `geo_type` and `time_type` here. If it's
127+
# significant overhead, we may also want to keep this less strict version
128+
# around and implement some extra S3 methods that use it, when appropriate.
101129

102130
# Amend additional metadata if some other_keys cols are dropped in the subset
103-
old_other_keys = attr(x, "metadata")$other_keys
131+
old_other_keys = attr(template, "metadata")$other_keys
104132
attr(res, "metadata")$other_keys <- old_other_keys[old_other_keys %in% cn]
105133

106134
res
107135
}
108136

109-
#' `dplyr` verbs
110-
#'
111-
#' `dplyr` verbs for `epi_df` objects, preserving class and attributes.
112-
#'
113-
#' @method arrange epi_df
114-
#' @param .data The `epi_df` object.
115-
#' @rdname print.epi_df
116-
#' @export
117-
arrange.epi_df = function(.data, ...) {
118-
metadata = attributes(.data)$metadata
119-
.data = NextMethod()
120-
reclass(.data, metadata)
121-
}
122-
123-
#' @method filter epi_df
124-
#' @rdname print.epi_df
125-
#' @export
126-
filter.epi_df = function(.data, ...) {
127-
metadata = attributes(.data)$metadata
128-
.data = NextMethod()
129-
reclass(.data, metadata)
130-
}
131-
132-
#' @method group_by epi_df
133-
#' @rdname print.epi_df
134137
#' @export
135-
group_by.epi_df = function(.data, ...) {
136-
metadata = attributes(.data)$metadata
137-
.data = NextMethod()
138-
reclass(.data, metadata)
139-
}
140-
141-
#' @method group_modify epi_df
142-
#' @rdname print.epi_df
143-
#' @export
144-
group_modify.epi_df = function(.data, ...) {
145-
metadata = attributes(.data)$metadata
146-
.data = NextMethod()
147-
reclass(.data, metadata)
148-
}
149-
150-
#' @method mutate epi_df
151-
#' @rdname print.epi_df
152-
#' @export
153-
mutate.epi_df = function(.data, ...) {
154-
metadata = attributes(.data)$metadata
155-
.data = NextMethod()
156-
reclass(.data, metadata)
138+
`[.epi_df` <- function(x, i, j, drop = FALSE) {
139+
res <- NextMethod()
140+
141+
if (!is.data.frame(res)) return(res)
142+
143+
dplyr::dplyr_reconstruct(res, x)
157144
}
158145

159-
#' @method relocate epi_df
160-
#' @rdname print.epi_df
146+
#' @importFrom dplyr dplyr_col_modify
161147
#' @export
162-
relocate.epi_df = function(.data, ...) {
163-
metadata = attributes(.data)$metadata
164-
.data = NextMethod()
165-
reclass(.data, metadata)
148+
dplyr_col_modify.epi_df = function(data, cols) {
149+
dplyr::dplyr_reconstruct(NextMethod(), data)
166150
}
167151

168-
#' @method rename epi_df
169-
#' @rdname print.epi_df
170152
#' @export
171-
rename.epi_df = function(.data, ...) {
172-
metadata = attributes(.data)$metadata
173-
.data = NextMethod()
174-
reclass(.data, metadata)
153+
`names<-.epi_df` = function(x, value) {
154+
old_names = names(x)
155+
old_other_keys = attributes(x)$metadata$other_keys
156+
result = NextMethod()
157+
attributes(x)$metadata$other_keys <- value[match(old_other_keys, old_names)]
158+
dplyr::dplyr_reconstruct(result, result)
175159
}
176160

177-
#' @method slice epi_df
161+
#' @method group_by epi_df
178162
#' @rdname print.epi_df
179163
#' @export
180-
slice.epi_df = function(.data, ...) {
164+
group_by.epi_df = function(.data, ...) {
181165
metadata = attributes(.data)$metadata
182166
.data = NextMethod()
183167
reclass(.data, metadata)

0 commit comments

Comments
 (0)