Skip to content

Commit 47eb171

Browse files
committed
add_chunk() uses VALUES statement to compute the number of rows in each chunk
1 parent 786b86a commit 47eb171

File tree

4 files changed

+52
-40
lines changed

4 files changed

+52
-40
lines changed

R/chunk.R

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,26 @@
1313
#' size limit on any discrete INSERT INTO statement.
1414
#'
1515
#' @param value The original data frame.
16-
#' @param chunk_size Maximum size (in bytes) of each unique chunk. Default to
17-
#' 750,000 bytes.
18-
#' @param chunk_fields A character vector of existing field names that are used
19-
#' to split the data frame.
16+
#' @param base_chunk_fields A character vector of existing field names that are
17+
#' used to split the data frame before checking the chunk size.
18+
#' @param chunk_size Maximum size (in bytes) of the VALUES statement encoding
19+
#' each unique chunk. Defaults to 1,000,000 bytes (i.e. 1 MB).
2020
#' @param new_chunk_field_name A string indicating the new chunk field name.
21-
#' Default to "chunk".
21+
#' Defaults to "aux_chunk_idx".
2222
#' @importFrom rlang :=
2323
#' @export
2424
#' @examples
2525
#' \dontrun{
2626
#' # returns the original data frame because it's within size
2727
#' add_chunk(iris)
28-
#' # add a new chunk_idx field
28+
#' # add a new aux_chunk_idx field
2929
#' add_chunk(iris, chunk_size = 2000)
30-
#' # the new chunk_idx field is added on top of Species
31-
#' add_chunk(iris, chunk_size = 2000, chunk_fields = c("Species"))
30+
#' # the new aux_chunk_idx field is added on top of Species
31+
#' add_chunk(iris, chunk_size = 2000, base_chunk_fields = c("Species"))
3232
#' }
3333
add_chunk <- function(
34-
value, chunk_size = 7.5e5,
35-
chunk_fields = NULL, new_chunk_field_name = "chunk_idx"
34+
value, base_chunk_fields = NULL, chunk_size = 1e6,
35+
new_chunk_field_name = "aux_chunk_idx"
3636
) {
3737
.add_chunk <- function(value, start = 1L) {
3838
if (new_chunk_field_name %in% colnames(value)) {
@@ -41,17 +41,23 @@ add_chunk <- function(
4141
call. = FALSE
4242
)
4343
}
44-
n_chunks <- (as.integer(utils::object.size(value)) %/% chunk_size) + 1
45-
chunk_size <- nrow(value) %/% n_chunks
44+
sample_value <- dplyr::slice(
45+
value, sample(1:nrow(value), 100, replace = TRUE)
46+
)
47+
sample_value_query_size <- utils::object.size(
48+
.create_values_statement(dummyPrestoConnection(), sample_value)
49+
)
50+
avg_row_query_size = as.integer(sample_value_query_size)/100
51+
n_rows_per_chunk <- chunk_size %/% avg_row_query_size
4652
dplyr::mutate(
4753
dplyr::ungroup(value),
4854
!!rlang::sym(new_chunk_field_name) :=
49-
start + as.integer((dplyr::row_number() - 1L) %/% chunk_size)
55+
start + as.integer((dplyr::row_number() - 1L) %/% n_rows_per_chunk)
5056
)
5157
}
5258

53-
if (!is.null(chunk_fields)) {
54-
split_values <- dplyr::group_split(value, !!!rlang::syms(chunk_fields))
59+
if (!is.null(base_chunk_fields)) {
60+
split_values <- dplyr::group_split(value, !!!rlang::syms(base_chunk_fields))
5561
start <- 0L
5662
res <- vector(mode = "list", length = length(split_values))
5763
for (i in seq_along(res)) {
@@ -65,7 +71,10 @@ add_chunk <- function(
6571
return(dplyr::bind_rows(res))
6672
}
6773
} else {
68-
if (utils::object.size(value) <= chunk_size) {
74+
value_query_size <- utils::object.size(
75+
.create_values_statement(dummyPrestoConnection(), value)
76+
)
77+
if (value_query_size <= chunk_size) {
6978
return(value)
7079
} else {
7180
return(.add_chunk(value))

R/dbWriteTable.R

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,10 @@ NULL
102102
{
103103
if (!found || overwrite) {
104104
if (use.one.query) {
105-
sql_values <- DBI::sqlData(conn, value)
106-
fields <- DBI::dbQuoteIdentifier(conn, names(sql_values))
107-
rows <- do.call(paste, c(unname(sql_values), sep = ", "))
105+
fields <- DBI::dbQuoteIdentifier(conn, colnames(value))
108106
sql <- DBI::SQL(paste0(
109107
"SELECT * FROM (\n",
110-
"VALUES\n",
111-
paste0(" (", rows, ")", collapse = ",\n"),
108+
.create_values_statement(conn, value),
112109
") AS t (", paste(fields, collapse = ", "), ")\n"
113110
))
114111
dbCreateTableAs(
@@ -180,3 +177,9 @@ setMethod(
180177
signature("PrestoConnection", "ANY", "data.frame"),
181178
.dbWriteTable
182179
)
180+
181+
# Build the SQL `VALUES` clause that encodes the rows of `value`.
#
# Arguments:
#   conn      Connection object handed to DBI::sqlData().
#   value     Data to encode; forwarded to DBI::sqlData() (callers visible
#             here pass a data frame).
#   row.names Forwarded to DBI::sqlData() as its third argument; FALSE by
#             default.
#
# Returns a DBI::SQL object whose text is "VALUES\n" followed by one
# " (<field>, <field>, ...)" tuple per row, with tuples joined by ",\n".
# The columns returned by DBI::sqlData() are pasted together element-wise
# with ", " to form each tuple.
.create_values_statement <- function(conn, value, row.names = FALSE) {
182+
sql_values <- DBI::sqlData(conn, value, row.names)
183+
rows <- do.call(paste, c(unname(sql_values), sep = ", "))
184+
DBI::SQL(paste0("VALUES\n", paste0(" (", rows, ")", collapse = ",\n")))
185+
}

man/add_chunk.Rd

Lines changed: 11 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-add_chunk.R

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ test_that("add_chunk returns the original data frame if within size", {
1414
iris
1515
)
1616
expect_equal_data_frame(
17-
add_chunk(iris, chunk_fields = c("Species")),
17+
add_chunk(iris, base_chunk_fields = c("Species")),
1818
iris
1919
)
2020
})
@@ -23,9 +23,9 @@ test_that("add_chunk adds a new field when larger than size limit", {
2323
chunk_iris <- add_chunk(iris, chunk_size = 2000)
2424
expect_equal(
2525
colnames(chunk_iris),
26-
c(colnames(iris), "chunk_idx")
26+
c(colnames(iris), "aux_chunk_idx")
2727
)
28-
expect_equal(class(chunk_iris$chunk_idx), "integer")
28+
expect_equal(class(chunk_iris$aux_chunk_idx), "integer")
2929
chunk_iris_2 <-
3030
add_chunk(iris, chunk_size = 2000, new_chunk_field_name = "chunk")
3131
expect_equal(
@@ -34,14 +34,14 @@ test_that("add_chunk adds a new field when larger than size limit", {
3434
)
3535
expect_equal(class(chunk_iris_2$chunk), "integer")
3636
chunk_iris_field <-
37-
add_chunk(iris, chunk_size = 2000, chunk_fields = c("Species"))
37+
add_chunk(iris, chunk_size = 2000, base_chunk_fields = c("Species"))
3838
expect_equal(
3939
colnames(chunk_iris_field),
40-
c(colnames(iris), "chunk_idx")
40+
c(colnames(iris), "aux_chunk_idx")
4141
)
42-
expect_equal(class(chunk_iris_field$chunk_idx), "integer")
42+
expect_equal(class(chunk_iris_field$aux_chunk_idx), "integer")
4343
expect_equal(
44-
nrow(dplyr::count(chunk_iris_field, Species, chunk_idx)),
45-
6L
44+
nrow(dplyr::count(chunk_iris_field, Species, aux_chunk_idx)),
45+
9L
4646
)
4747
})

0 commit comments

Comments
 (0)