Skip to content

Commit 66d1172

Browse files
authored
Merge pull request #215 from cmu-delphi/lcb/fix-merge-truncate-data-table-with-false
Fix `epix_merge` `sync="truncate"` err (`with=F` no `j`), add tests
2 parents b4e6158 + 1add677 commit 66d1172

File tree

2 files changed

+54
-14
lines changed

2 files changed

+54
-14
lines changed

R/methods-epi_archive.R

+28-10
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ epix_fill_through_version = function(x, fill_versions_end,
152152
#' # vs. mutating x to hold the merge result:
153153
#' x$merge(y)
154154
#'
155-
#' @importFrom data.table key set
155+
#' @importFrom data.table key set setkeyv
156156
#' @export
157157
epix_merge = function(x, y,
158158
sync = c("forbid","na","locf","truncate"),
@@ -215,18 +215,36 @@ epix_merge = function(x, y,
215215
y_DT = epix_fill_through_version(y, new_versions_end, sync)$DT
216216
} else if (sync == "truncate") {
217217
new_versions_end = min(x$versions_end, y$versions_end)
218-
x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, with=FALSE]
219-
y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, with=FALSE]
218+
x_DT = x$DT[x[["DT"]][["version"]] <= new_versions_end, names(x$DT), with=FALSE]
219+
y_DT = y$DT[y[["DT"]][["version"]] <= new_versions_end, names(y$DT), with=FALSE]
220220
} else Abort("unimplemented")
221221

222-
if (!identical(key(x$DT), key(x_DT)) || !identical(key(y$DT), key(y_DT))) {
223-
Abort("preprocessing of data tables in merge changed the key unexpectedly",
224-
internal=TRUE)
222+
# key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same
223+
# as key(y$DT). Below, we only use {x,y}_DT in the code (making it easier to
224+
# split the code into separate functions if we wish), but still refer to
225+
# {x,y}$DT in the error messages (further relying on this assumption).
226+
#
227+
# Check&ensure that the above assumption; if it didn't already hold, we likely
228+
# have a bug in the preprocessing, a weird/invalid archive as input, and/or a
229+
# data.table version with different semantics (which may break other parts of
230+
# our code).
231+
x_DT_key_as_expected = identical(key(x$DT), key(x_DT))
232+
y_DT_key_as_expected = identical(key(y$DT), key(y_DT))
233+
if (!x_DT_key_as_expected || !y_DT_key_as_expected) {
234+
Warn("
235+
`epiprocess` internal warning (please report): pre-processing for
236+
epix_merge unexpectedly resulted in an intermediate data table (or
237+
tables) with a different key than the corresponding input archive.
238+
Manually setting intermediate data table keys to the expected values.
239+
", internal=TRUE)
240+
setkeyv(x_DT, key(x$DT))
241+
setkeyv(y_DT, key(y$DT))
225242
}
226-
## key(x_DT) should be the same as key(x$DT) and key(y_DT) should be the same
227-
## as key(y$DT). If we want to break this function into parts it makes sense
228-
## to use {x,y}_DT below, but this makes the error checks and messages look a
229-
## little weird and rely on the key-matching assumption above.
243+
# Without some sort of annotations of what various columns represent, we can't
244+
# do something that makes sense when merging archives with mismatched keys.
245+
# E.g., even if we assume extra keys represent demographic breakdowns, a
246+
# sensible default treatment of count-type and rate-type value columns would
247+
# differ.
230248
if (!identical(sort(key(x_DT)), sort(key(y_DT)))) {
231249
Abort("
232250
The archives must have the same set of key column names; if the

tests/testthat/test-epix_merge.R

+26-4
Original file line numberDiff line numberDiff line change
@@ -121,9 +121,13 @@ local({
121121
})
122122

123123
local({
124-
x = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L))
125-
y = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L))
126-
print(epix_merge(x,y, sync = "na"))
124+
x = as_epi_archive(
125+
tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L),
126+
versions_end = 3L
127+
)
128+
y = as_epi_archive(
129+
tibble::tibble(geo_value=1L, time_value=1L, version=5L, y_value=20L)
130+
)
127131
test_that('epix_merge forbids on sync default or "forbid"', {
128132
expect_error(epix_merge(x,y),
129133
class="epiprocess__epix_merge_unresolved_sync")
@@ -136,8 +140,10 @@ local({
136140
as_epi_archive(tibble::tribble(
137141
~geo_value, ~time_value, ~version, ~x_value, ~y_value,
138142
1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet
139-
1L, 1L, 2L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet
143+
1L, 1L, 4L, NA_integer_, NA_integer_, # NA-ing out x, y not observed yet
140144
1L, 1L, 5L, NA_integer_, 20L, # x still NA, y updated
145+
# (we should not have a y vals -> NA update here; version 5 should be
146+
# the `versions_end` of the result)
141147
), clobberable_versions_start=1L)
142148
)
143149
})
@@ -151,6 +157,16 @@ local({
151157
), clobberable_versions_start=1L)
152158
)
153159
})
160+
test_that('epix_merge sync="truncate" works', {
161+
expect_equal(
162+
epix_merge(x,y, sync = "truncate"),
163+
as_epi_archive(tibble::tribble(
164+
~geo_value, ~time_value, ~version, ~x_value, ~y_value,
165+
1L, 1L, 1L, 10L, NA_integer_, # x updated, y not observed yet
166+
# y's update beyond x's last update has been truncated
167+
), clobberable_versions_start=1L, versions_end=3L)
168+
)
169+
})
154170
x_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, x_value=10L))
155171
y_no_conflict = as_epi_archive(tibble::tibble(geo_value=1L, time_value=1L, version=1L, y_value=20L))
156172
xy_no_conflict_expected = as_epi_archive(tibble::tribble(
@@ -178,6 +194,12 @@ local({
178194
xy_no_conflict_expected
179195
)
180196
})
197+
test_that('epix_merge sync="truncate" on no-conflict works', {
198+
expect_equal(
199+
epix_merge(x_no_conflict, y_no_conflict, sync = "truncate"),
200+
xy_no_conflict_expected
201+
)
202+
})
181203
})
182204

183205

0 commit comments

Comments
 (0)