 #' size limit on any discrete INSERT INTO statement.
 #'
 #' @param value The original data frame.
-#' @param chunk_size Maximum size (in bytes) of each unique chunk. Default to
-#'   750,000 bytes.
-#' @param chunk_fields A character vector of existing field names that are used
-#'   to split the data frame.
+#' @param base_chunk_fields A character vector of existing field names that are
+#'   used to split the data frame before checking the chunk size.
+#' @param chunk_size Maximum size (in bytes) of the VALUES statement encoding
+#'   each unique chunk. Defaults to 1,000,000 bytes (i.e. 1 MB).
 #' @param new_chunk_field_name A string indicating the new chunk field name.
-#'   Default to "chunk".
+#'   Defaults to "aux_chunk_idx".
 #' @importFrom rlang :=
 #' @export
 #' @examples
 #' \dontrun{
 #' # returns the original data frame because it's within size
 #' add_chunk(iris)
-#' # add a new chunk_idx field
+#' # add a new aux_chunk_idx field
 #' add_chunk(iris, chunk_size = 2000)
-#' # the new chunk_idx field is added on top of Species
-#' add_chunk(iris, chunk_size = 2000, chunk_fields = c("Species"))
+#' # the new aux_chunk_idx field is added on top of Species
+#' add_chunk(iris, chunk_size = 2000, base_chunk_fields = c("Species"))
 #' }
 add_chunk <- function(
-  value, chunk_size = 7.5e5,
-  chunk_fields = NULL, new_chunk_field_name = "chunk_idx"
+  value, base_chunk_fields = NULL, chunk_size = 1e6,
+  new_chunk_field_name = "aux_chunk_idx"
 ) {
   .add_chunk <- function(value, start = 1L) {
     if (new_chunk_field_name %in% colnames(value)) {
@@ -41,17 +41,23 @@ add_chunk <- function(
         call. = FALSE
       )
     }
-    n_chunks <- (as.integer(utils::object.size(value)) %/% chunk_size) + 1
-    chunk_size <- nrow(value) %/% n_chunks
+    sample_value <- dplyr::slice(
+      value, sample(1:nrow(value), 100, replace = TRUE)
+    )
+    sample_value_query_size <- utils::object.size(
+      .create_values_statement(dummyPrestoConnection(), sample_value)
+    )
+    avg_row_query_size <- as.integer(sample_value_query_size) / 100
+    n_rows_per_chunk <- chunk_size %/% avg_row_query_size
     dplyr::mutate(
       dplyr::ungroup(value),
       !!rlang::sym(new_chunk_field_name) :=
-        start + as.integer((dplyr::row_number() - 1L) %/% chunk_size)
+        start + as.integer((dplyr::row_number() - 1L) %/% n_rows_per_chunk)
     )
   }
 
-  if (!is.null(chunk_fields)) {
-    split_values <- dplyr::group_split(value, !!!rlang::syms(chunk_fields))
+  if (!is.null(base_chunk_fields)) {
+    split_values <- dplyr::group_split(value, !!!rlang::syms(base_chunk_fields))
     start <- 0L
     res <- vector(mode = "list", length = length(split_values))
     for (i in seq_along(res)) {
@@ -65,7 +71,10 @@ add_chunk <- function(
       return(dplyr::bind_rows(res))
     }
   } else {
-    if (utils::object.size(value) <= chunk_size) {
+    value_query_size <- utils::object.size(
+      .create_values_statement(dummyPrestoConnection(), value)
+    )
+    if (value_query_size <= chunk_size) {
       return(value)
     } else {
       return(.add_chunk(value))
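
The sampling-based estimate introduced in this commit can be reproduced outside the package. Below is a minimal sketch, assuming a hypothetical `encode_values_row()` in place of the internal `.create_values_statement(dummyPrestoConnection(), ...)` helper; only the 100-row sampling and the integer division mirror the diff above.

```r
# Minimal sketch of the rows-per-chunk estimate used in .add_chunk().
# encode_values_row() is a hypothetical stand-in for the package-internal
# .create_values_statement(dummyPrestoConnection(), ...) helper: it renders
# one data frame row roughly the way a VALUES (...) clause would.
encode_values_row <- function(row) {
  paste0("(", paste(vapply(row, format, character(1)), collapse = ", "), ")")
}

estimate_rows_per_chunk <- function(value, chunk_size = 1e6, n_sample = 100) {
  # Sample with replacement so the estimate also works when nrow(value) < n_sample.
  sampled <- value[sample(seq_len(nrow(value)), n_sample, replace = TRUE), ]
  encoded <- vapply(
    seq_len(nrow(sampled)),
    function(i) encode_values_row(sampled[i, ]),
    character(1)
  )
  # Average per-row size of the encoded VALUES clause, then rows per chunk,
  # as in chunk_size %/% avg_row_query_size above.
  values_clause <- paste(encoded, collapse = ", ")
  avg_row_query_size <- as.integer(utils::object.size(values_clause)) / n_sample
  chunk_size %/% avg_row_query_size
}

estimate_rows_per_chunk(iris, chunk_size = 2000)
```

The exact byte counts depend on how the real encoder quotes and escapes values, so this only approximates what `.add_chunk()` computes.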
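The added index exists so a writer can split one oversized INSERT into several statements, each under `chunk_size`. A sketch of that consumption pattern, assuming `add_chunk()` is loaded and using a hypothetical `insert_values()` stub in place of a real statement writer (neither the stub nor this loop is part of the diff):

```r
# Sketch: consume the aux_chunk_idx field added by add_chunk() so that each
# chunk becomes its own INSERT INTO ... VALUES statement.
# insert_values() is a hypothetical stub, not part of the package.
insert_values <- function(table, df) {
  message(sprintf("INSERT INTO %s: %d rows", table, nrow(df)))
}

chunked <- add_chunk(iris, chunk_size = 2000)  # adds aux_chunk_idx
for (chunk in split(chunked, chunked$aux_chunk_idx)) {
  # Drop the auxiliary index before writing; it is bookkeeping, not data.
  insert_values("iris_table", chunk[, setdiff(names(chunk), "aux_chunk_idx")])
}
```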