Skip to content

Commit 998a2a1

Browse files
committed
ARROW-11514: [R][C++] Bindings for paste(), paste0(), str_c()
Adds support for the string concatenation functions `paste()`, `paste0()`, and `str_c()` in dplyr verbs. Only the non-aggregating `collapse = NULL` case is currently supported. Closes #10547 from ianmcook/ARROW-11514 Authored-by: Ian Cook <[email protected]> Signed-off-by: Ian Cook <[email protected]>
1 parent 9aaf61c commit 998a2a1

File tree

7 files changed

+242
-3
lines changed

7 files changed

+242
-3
lines changed

cpp/src/arrow/compute/kernels/scalar_string.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3587,10 +3587,12 @@ void AddBinaryJoin(FunctionRegistry* registry) {
35873587
"binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
35883588
&binary_join_element_wise_doc, &kDefaultJoinOptions);
35893589
for (const auto& ty : BaseBinaryTypes()) {
3590-
DCHECK_OK(
3591-
func->AddKernel({InputType(ty)}, ty,
3590+
ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
35923591
GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
3593-
BinaryJoinElementWiseState::Init));
3592+
BinaryJoinElementWiseState::Init};
3593+
kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
3594+
kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
3595+
DCHECK_OK(func->AddKernel(std::move(kernel)));
35943596
}
35953597
DCHECK_OK(registry->AddFunction(std::move(func)));
35963598
}

r/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ export(MessageReader)
153153
export(MessageType)
154154
export(MetadataVersion)
155155
export(NullEncodingBehavior)
156+
export(NullHandlingBehavior)
156157
export(ParquetArrowReaderProperties)
157158
export(ParquetFileFormat)
158159
export(ParquetFileReader)

r/R/dplyr-functions.R

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,61 @@ nse_funcs$nchar <- function(x, type = "chars", allowNA = FALSE, keepNA = NA) {
215215
}
216216
}
217217

218+
# Arrow binding for paste() inside dplyr verbs.
#
# Only the `collapse = NULL` form is accepted. `recycle0` is part of the
# signature but is not consulted by this binding. The separator is forwarded
# as the final argument of the join, after all of the `...` arguments.
nse_funcs$paste <- function(..., sep = " ", collapse = NULL, recycle0 = FALSE) {
  assert_that(
    is.null(collapse),
    msg = "paste() with the collapse argument is not yet supported in Arrow"
  )
  # Only a literal separator is checked for NA here; a separator that is
  # already an Arrow Expression is passed through unchecked.
  sep_is_literal <- !inherits(sep, "Expression")
  if (sep_is_literal) {
    assert_that(!is.na(sep), msg = "Invalid separator")
  }
  # Null handling: REPLACE, with the replacement string "NA".
  join_strings <- arrow_string_join_function(
    null_handling = NullHandlingBehavior$REPLACE,
    null_replacement = "NA"
  )
  join_strings(..., sep)
}
228+
229+
# Arrow binding for paste0() inside dplyr verbs.
#
# Only the `collapse = NULL` form is accepted. `recycle0` is part of the
# signature but is not consulted by this binding. The join always uses the
# empty string as its separator, supplied as the final argument.
nse_funcs$paste0 <- function(..., collapse = NULL, recycle0 = FALSE) {
  assert_that(
    is.null(collapse),
    msg = "paste0() with the collapse argument is not yet supported in Arrow"
  )
  # Null handling: REPLACE, with the replacement string "NA".
  join_strings <- arrow_string_join_function(
    null_handling = NullHandlingBehavior$REPLACE,
    null_replacement = "NA"
  )
  join_strings(..., "")
}
236+
237+
# Arrow binding for str_c() inside dplyr verbs.
#
# Only the `collapse = NULL` form is accepted. The separator is forwarded as
# the final argument of the join, and no replacement string is supplied: the
# EMIT_NULL null-handling behavior is requested instead.
nse_funcs$str_c <- function(..., sep = "", collapse = NULL) {
  assert_that(
    is.null(collapse),
    msg = "str_c() with the collapse argument is not yet supported in Arrow"
  )
  join_strings <- arrow_string_join_function(
    null_handling = NullHandlingBehavior$EMIT_NULL
  )
  join_strings(..., sep)
}
244+
245+
# Build a function that joins its arguments element-wise into strings using
# the `binary_join_element_wise` Arrow C++ compute kernel.
#
# `null_handling` and `null_replacement` are forwarded unchanged as the
# kernel options. The kernel takes the separator as its last argument, so
# callers of the returned function must pass `sep` as the final dots arg.
arrow_string_join_function <- function(null_handling, null_replacement = NULL) {
  # Turn one argument into a string Expression. Everything is cast to string
  # for consistency with base::paste(), base::paste0(), and stringr::str_c().
  as_string_expression <- function(arg) {
    if (inherits(arg, "Expression")) {
      return(nse_funcs$as.character(arg))
    }
    # Anything that is not an Expression is treated as a literal, and only
    # length-1 literals are accepted.
    assert_that(
      length(arg) == 1,
      msg = "Literal vectors of length != 1 not supported in string concatenation"
    )
    Expression$scalar(as.character(arg))
  }

  function(...) {
    Expression$create(
      "binary_join_element_wise",
      args = lapply(list(...), as_string_expression),
      options = list(
        null_handling = null_handling,
        null_replacement = null_replacement
      )
    )
  }
}
272+
218273
nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) {
219274
side <- match.arg(side)
220275
trim_fun <- switch(side,

r/R/enums.R

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,9 @@ QuantileInterpolation <- enum("QuantileInterpolation",
140140
NullEncodingBehavior <- enum("NullEncodingBehavior",
141141
ENCODE = 0L, MASK = 1L
142142
)
143+
144+
#' @export
#' @rdname enums
NullHandlingBehavior <- enum(
  "NullHandlingBehavior",
  EMIT_NULL = 0L, # requested by the str_c() binding
  SKIP = 1L,
  REPLACE = 2L # requested by paste() / paste0() with a replacement string
)

r/man/enums.Rd

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/compute.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,20 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
218218
return make_cast_options(options);
219219
}
220220

221+
if (func_name == "binary_join_element_wise") {
222+
using Options = arrow::compute::JoinOptions;
223+
auto out = std::make_shared<Options>(Options::Defaults());
224+
if (!Rf_isNull(options["null_handling"])) {
225+
out->null_handling =
226+
cpp11::as_cpp<enum arrow::compute::JoinOptions::NullHandlingBehavior>(
227+
options["null_handling"]);
228+
}
229+
if (!Rf_isNull(options["null_replacement"])) {
230+
out->null_replacement = cpp11::as_cpp<std::string>(options["null_replacement"]);
231+
}
232+
return out;
233+
}
234+
221235
if (func_name == "match_substring" || func_name == "match_substring_regex") {
222236
using Options = arrow::compute::MatchSubstringOptions;
223237
bool ignore_case = false;

r/tests/testthat/test-dplyr-string-functions.R

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,162 @@ skip_if_not_available("utf8proc")
2121
library(dplyr)
2222
library(stringr)
2323

24+
test_that("paste, paste0, and str_c", {
  # Fixture: two complete string columns (v, w), two string columns holding
  # NAs (x, y), and one numeric column holding an NA (z).
  df <- tibble(
    v = c("A", "B", "C"),
    w = c("a", "b", "c"),
    x = c("d", NA_character_, "f"),
    y = c(NA_character_, "h", "i"),
    z = c(1.1, 2.2, NA)
  )
  # Field references for the direct nse_funcs calls further down.
  x <- Expression$field_ref("x")
  y <- Expression$field_ref("y")

  # ---- columns without NAs ----
  expect_dplyr_equal(input %>% transmute(paste(v, w)) %>% collect(), df)
  expect_dplyr_equal(
    input %>% transmute(paste(v, w, sep = "-")) %>% collect(),
    df
  )
  expect_dplyr_equal(input %>% transmute(paste0(v, w)) %>% collect(), df)
  expect_dplyr_equal(input %>% transmute(str_c(v, w)) %>% collect(), df)
  expect_dplyr_equal(
    input %>% transmute(str_c(v, w, sep = "+")) %>% collect(),
    df
  )

  # ---- columns with NAs ----
  expect_dplyr_equal(input %>% transmute(paste(x, y)) %>% collect(), df)
  expect_dplyr_equal(
    input %>% transmute(paste(x, y, sep = "-")) %>% collect(),
    df
  )
  expect_dplyr_equal(input %>% transmute(str_c(x, y)) %>% collect(), df)

  # ---- a non-character column among the dots ----
  expect_dplyr_equal(input %>% transmute(paste0(x, y, z)) %>% collect(), df)

  # ---- a literal string among the dots ----
  expect_dplyr_equal(
    input %>% transmute(paste(x, "foo", y)) %>% collect(),
    df
  )

  # ---- a literal NA among the dots ----
  expect_dplyr_equal(input %>% transmute(paste(x, NA, y)) %>% collect(), df)

  # ---- expressions among the dots ----
  expect_dplyr_equal(
    input %>%
      transmute(paste0(x, toupper(y), as.character(z))) %>%
      collect(),
    df
  )

  # ---- separator is a literal NA ----
  # paste() raises an error (consistent with base::paste())
  expect_error(
    nse_funcs$paste(x, y, sep = NA_character_),
    "Invalid separator"
  )
  # str_c() emits null (consistent with stringr::str_c())
  expect_dplyr_equal(
    input %>% transmute(str_c(x, y, sep = NA_character_)) %>% collect(),
    df
  )

  # ---- sep given to paste0(), which has no sep argument of its own ----
  expect_dplyr_equal(
    input %>% transmute(paste0(x, y, sep = "-")) %>% collect(),
    df
  )

  # ---- known differences ----

  # arrow accepts an array as the separator
  expect_equal(
    df %>%
      Table$create() %>%
      transmute(result = paste(x, y, sep = w)) %>%
      collect(),
    df %>% transmute(result = paste(x, w, y, sep = ""))
  )

  # ---- expected errors ----

  # the collapse argument is rejected by all three bindings
  expect_error(nse_funcs$paste(x, y, collapse = ""), "collapse")
  expect_error(nse_funcs$paste0(x, y, collapse = ""), "collapse")
  expect_error(nse_funcs$str_c(x, y, collapse = ""), "collapse")

  # literal vectors whose length is not 1 are rejected
  expect_error(
    nse_funcs$paste(x, character(0), y),
    "Literal vectors of length != 1 not supported in string concatenation"
  )
  expect_error(
    nse_funcs$paste(x, c(",", ";"), y),
    "Literal vectors of length != 1 not supported in string concatenation"
  )
})
179+
24180
test_that("grepl with ignore.case = FALSE and fixed = TRUE", {
25181
df <- tibble(x = c("Foo", "bar"))
26182
expect_dplyr_equal(

0 commit comments

Comments
 (0)