Skip to content

Commit

Permalink
ARROW-11514: [R][C++] Bindings for paste(), paste0(), str_c()
Browse files Browse the repository at this point in the history
Adds support for the string concatenation functions `paste()`, `paste0()`, and `str_c()` in dplyr verbs. Only the non-aggregating `collapse = NULL` case is currently supported.

Closes #10547 from ianmcook/ARROW-11514

Authored-by: Ian Cook <[email protected]>
Signed-off-by: Ian Cook <[email protected]>
  • Loading branch information
ianmcook committed Jun 23, 2021
1 parent 9aaf61c commit 998a2a1
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 3 deletions.
8 changes: 5 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3587,10 +3587,12 @@ void AddBinaryJoin(FunctionRegistry* registry) {
"binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
&binary_join_element_wise_doc, &kDefaultJoinOptions);
for (const auto& ty : BaseBinaryTypes()) {
DCHECK_OK(
func->AddKernel({InputType(ty)}, ty,
ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
BinaryJoinElementWiseState::Init));
BinaryJoinElementWiseState::Init};
kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
DCHECK_OK(func->AddKernel(std::move(kernel)));
}
DCHECK_OK(registry->AddFunction(std::move(func)));
}
Expand Down
1 change: 1 addition & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ export(MessageReader)
export(MessageType)
export(MetadataVersion)
export(NullEncodingBehavior)
export(NullHandlingBehavior)
export(ParquetArrowReaderProperties)
export(ParquetFileFormat)
export(ParquetFileReader)
Expand Down
55 changes: 55 additions & 0 deletions r/R/dplyr-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,61 @@ nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) {
}
}

# Arrow binding for base::paste() inside dplyr verbs.
#
# Only the element-wise (non-aggregating) form is handled: supplying
# `collapse` is rejected up front. `recycle0` is accepted so the signature
# lines up with base::paste(), but it is not consulted by this binding.
# `sep` may be a literal or an Arrow Expression; a literal NA separator is
# rejected with "Invalid separator".
nse_funcs$paste <- function(..., sep = " ", collapse = NULL, recycle0 = FALSE) {
  assert_that(
    is.null(collapse),
    msg = "paste() with the collapse argument is not yet supported in Arrow"
  )
  # The NA check only makes sense for literal separators; an Expression
  # separator is passed through untouched.
  sep_is_literal <- !inherits(sep, "Expression")
  if (sep_is_literal) {
    assert_that(!is.na(sep), msg = "Invalid separator")
  }
  # Request REPLACE null handling with "NA" as the replacement string, and
  # hand the separator over as the final argument.
  join_strings <- arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")
  join_strings(..., sep)
}

# Arrow binding for base::paste0() inside dplyr verbs.
#
# Same as the paste() binding but with a fixed empty separator. Supplying
# `collapse` is rejected; `recycle0` is accepted for signature compatibility
# and is not consulted by this binding.
nse_funcs$paste0 <- function(..., collapse = NULL, recycle0 = FALSE) {
  assert_that(
    is.null(collapse),
    msg = "paste0() with the collapse argument is not yet supported in Arrow"
  )
  # REPLACE null handling with "NA" as the replacement string; the empty
  # string goes last because the join function expects the separator there.
  concatenate <- arrow_string_join_function(NullHandlingBehavior$REPLACE, "NA")
  concatenate(..., "")
}

# Arrow binding for stringr::str_c() inside dplyr verbs.
#
# Supplying `collapse` is rejected. Unlike the paste()/paste0() bindings,
# this one requests EMIT_NULL null handling and passes no replacement string,
# and it performs no NA check on `sep`.
nse_funcs$str_c <- function(..., sep = "", collapse = NULL) {
  assert_that(
    is.null(collapse),
    msg = "str_c() with the collapse argument is not yet supported in Arrow"
  )
  # The separator is supplied as the final argument to the join function.
  join_or_null <- arrow_string_join_function(NullHandlingBehavior$EMIT_NULL)
  join_or_null(..., sep)
}

# Build a function that joins its arguments element-wise into one string
# Expression via the `binary_join_element_wise` Arrow C++ compute kernel.
#
# `null_handling` and `null_replacement` are forwarded unchanged as the
# kernel's options. The returned function takes the values to join followed
# by the separator: the kernel expects the separator as its last argument, so
# callers must pass `sep` as the final dots argument.
arrow_string_join_function <- function(null_handling, null_replacement = NULL) {
  # Turn a single argument into a string-typed Expression, for consistency
  # with base::paste(), base::paste0(), and stringr::str_c().
  to_string_expression <- function(arg) {
    if (inherits(arg, "Expression")) {
      # Already an Expression: cast it to string.
      return(nse_funcs$as.character(arg))
    }
    # Literal value: only length-1 literals can become a scalar.
    assert_that(
      length(arg) == 1,
      msg = "Literal vectors of length != 1 not supported in string concatenation"
    )
    Expression$scalar(as.character(arg))
  }

  function(...) {
    string_args <- lapply(list(...), to_string_expression)
    Expression$create(
      "binary_join_element_wise",
      args = string_args,
      options = list(
        null_handling = null_handling,
        null_replacement = null_replacement
      )
    )
  }
}

nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) {
side <- match.arg(side)
trim_fun <- switch(side,
Expand Down
6 changes: 6 additions & 0 deletions r/R/enums.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,9 @@ QuantileInterpolation <- enum("QuantileInterpolation",
NullEncodingBehavior <- enum("NullEncodingBehavior",
ENCODE = 0L, MASK = 1L
)

# Null-handling modes passed as the `null_handling` option of the
# `binary_join_element_wise` compute function (see the paste(), paste0(), and
# str_c() bindings). The integer codes are converted on the C++ side to
# `arrow::compute::JoinOptions::NullHandlingBehavior`.
# NOTE(review): the codes presumably have to match the order of the C++ enum
# members -- confirm against the Arrow C++ headers before changing them.
#' @export
#' @rdname enums
NullHandlingBehavior <- enum("NullHandlingBehavior",
EMIT_NULL = 0L, SKIP = 1L, REPLACE = 2L
)
5 changes: 5 additions & 0 deletions r/man/enums.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions r/src/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,20 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
return make_cast_options(options);
}

if (func_name == "binary_join_element_wise") {
using Options = arrow::compute::JoinOptions;
auto out = std::make_shared<Options>(Options::Defaults());
if (!Rf_isNull(options["null_handling"])) {
out->null_handling =
cpp11::as_cpp<enum arrow::compute::JoinOptions::NullHandlingBehavior>(
options["null_handling"]);
}
if (!Rf_isNull(options["null_replacement"])) {
out->null_replacement = cpp11::as_cpp<std::string>(options["null_replacement"]);
}
return out;
}

if (func_name == "match_substring" || func_name == "match_substring_regex") {
using Options = arrow::compute::MatchSubstringOptions;
bool ignore_case = false;
Expand Down
156 changes: 156 additions & 0 deletions r/tests/testthat/test-dplyr-string-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,162 @@ skip_if_not_available("utf8proc")
library(dplyr)
library(stringr)

# Covers the paste(), paste0(), and str_c() bindings used inside dplyr verbs:
# plain joins, NA handling, non-character and literal arguments, separator
# edge cases, one documented divergence from base R, and the expected errors.
# NOTE(review): expect_dplyr_equal() is defined elsewhere; it presumably runs
# the `input %>% ...` pipeline on both `df` and an Arrow version of `df` and
# compares the results -- confirm in the test helpers.
test_that("paste, paste0, and str_c", {
# v and w contain no NAs; x and y each have one NA in a different row;
# z is a numeric column with an NA in the last row.
df <- tibble(
v = c("A", "B", "C"),
w = c("a", "b", "c"),
x = c("d", NA_character_, "f"),
y = c(NA_character_, "h", "i"),
z = c(1.1, 2.2, NA)
)
# Field references used further down when calling the nse_funcs bindings
# directly, outside of a dplyr verb.
x <- Expression$field_ref("x")
y <- Expression$field_ref("y")

# no NAs in data
expect_dplyr_equal(
input %>%
transmute(paste(v, w)) %>%
collect(),
df
)
expect_dplyr_equal(
input %>%
transmute(paste(v, w, sep = "-")) %>%
collect(),
df
)
expect_dplyr_equal(
input %>%
transmute(paste0(v, w)) %>%
collect(),
df
)
expect_dplyr_equal(
input %>%
transmute(str_c(v, w)) %>%
collect(),
df
)
expect_dplyr_equal(
input %>%
transmute(str_c(v, w, sep = "+")) %>%
collect(),
df
)

# NAs in data
expect_dplyr_equal(
input %>%
transmute(paste(x, y)) %>%
collect(),
df
)
expect_dplyr_equal(
input %>%
transmute(paste(x, y, sep = "-")) %>%
collect(),
df
)
expect_dplyr_equal(
input %>%
transmute(str_c(x, y)) %>%
collect(),
df
)

# non-character column in dots
# (z is numeric, so this exercises the cast-to-string of each argument)
expect_dplyr_equal(
input %>%
transmute(paste0(x, y, z)) %>%
collect(),
df
)

# literal string in dots
expect_dplyr_equal(
input %>%
transmute(paste(x, "foo", y)) %>%
collect(),
df
)

# literal NA in dots
expect_dplyr_equal(
input %>%
transmute(paste(x, NA, y)) %>%
collect(),
df
)

# expressions in dots
expect_dplyr_equal(
input %>%
transmute(paste0(x, toupper(y), as.character(z))) %>%
collect(),
df
)

# sep is literal NA
# errors in paste() (consistent with base::paste())
expect_error(
nse_funcs$paste(x, y, sep = NA_character_),
"Invalid separator"
)
# emits null in str_c() (consistent with stringr::str_c())
expect_dplyr_equal(
input %>%
transmute(str_c(x, y, sep = NA_character_)) %>%
collect(),
df
)

# sep passed in dots to paste0 (which doesn't take a sep argument)
# so "-" lands in `...` and is joined as one more string
expect_dplyr_equal(
input %>%
transmute(paste0(x, y, sep = "-")) %>%
collect(),
df
)

# known differences

# arrow allows the separator to be an array
# The expected value is built in plain R by placing w between x and y as an
# ordinary argument with an empty separator.
expect_equal(
df %>%
Table$create() %>%
transmute(result = paste(x, y, sep = w)) %>%
collect(),
df %>%
transmute(result = paste(x, w, y, sep = ""))
)

# expected errors

# collapse argument not supported
expect_error(
nse_funcs$paste(x, y, collapse = ""),
"collapse"
)
expect_error(
nse_funcs$paste0(x, y, collapse = ""),
"collapse"
)
expect_error(
nse_funcs$str_c(x, y, collapse = ""),
"collapse"
)

# literal vectors of length != 1 not supported
# (checked with both a zero-length and a length-2 literal)
expect_error(
nse_funcs$paste(x, character(0), y),
"Literal vectors of length != 1 not supported in string concatenation"
)
expect_error(
nse_funcs$paste(x, c(",", ";"), y),
"Literal vectors of length != 1 not supported in string concatenation"
)
})

test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
df <- tibble(x = c("Foo", "bar"))
expect_dplyr_equal(
Expand Down

0 comments on commit 998a2a1

Please sign in to comment.