Skip to content

Commit e4f0754

Browse files
authored
feat(r): Add bindings for IPC reader (#390)
This PR adds bindings to nanoarrow's IPC reader from R. The entrypoint for a user is `read_nanoarrow()`, which accepts raw vectors, connections, and file paths (thin wrapper around connections). It also fixes a number of compiler warnings in the IPC extension. The implementation is not particularly complicated from the R side, but the main drawback of adding IPC support is that the flatbuffers implementation (flatcc) actively does not care about gcc compiler warnings (whereas CRAN actively cares about them). These are all slower than the arrow package, which has more tools at its disposal to prevent copies. ``` r library(arrow, warn.conflicts = FALSE) library(nanoarrow) # Basic read example tf <- tempfile() write_ipc_stream(dplyr::starwars, tf) read_nanoarrow(tf) |> tibble::as_tibble() #> # A tibble: 87 × 14 #> name height mass hair_color skin_color eye_color birth_year sex gender #> <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> #> 1 Luke Sk… 172 77 blond fair blue 19 male mascu… #> 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… #> 3 R2-D2 96 32 <NA> white, bl… red 33 none mascu… #> 4 Darth V… 202 136 none white yellow 41.9 male mascu… #> 5 Leia Or… 150 49 brown light brown 19 fema… femin… #> 6 Owen La… 178 120 brown, gr… light blue 52 male mascu… #> 7 Beru Wh… 165 75 brown light blue 47 fema… femin… #> 8 R5-D4 97 32 <NA> white, red red NA none mascu… #> 9 Biggs D… 183 84 black light brown 24 male mascu… #> 10 Obi-Wan… 182 77 auburn, w… fair blue-gray 57 male mascu… #> # ℹ 77 more rows #> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list<chr>>, #> # vehicles <list<chr>>, starships <list<chr>> df_bigish <- nanoarrow:::vec_gen(data.frame(x = character()), n = 1e6) write_ipc_stream(df_bigish, tf) # Wrapper because mmap is apparently not passed through from read_ipc_stream() # and this is pretty significant read_ipc_stream_wrap <- function(f, ..., mmap) { arrow::read_ipc_stream( arrow:::make_readable_file(f, mmap = mmap, random_access = FALSE), 
... ) } tf_raw <- brio::read_file_raw(tf) # Slower than arrow for raw vector input because of C implementation, # which doesn't currently share the global buffer (just shares buffers # between columns within a single batch) bench::mark( nanoarrow = read_nanoarrow(tf_raw) |> collect_array_stream(), arrow = read_ipc_stream(buffer(tf_raw), as_data_frame = FALSE), check = FALSE ) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> #> 1 nanoarrow 1.27ms 1.84ms 439. 41.5KB 2.18 #> 2 arrow 509.26µs 528.65µs 1821. 3.6MB 79.3 # Slower than arrow, maybe because of C implementation, but definitely # because it uses base::readBin() which necessitates an extra copy bench::mark( nanoarrow = read_nanoarrow(tf) |> collect_array_stream(), arrow_mmap = read_ipc_stream_wrap(tf, mmap = TRUE, as_data_frame = FALSE), arrow = read_ipc_stream_wrap(tf, mmap = FALSE, as_data_frame = FALSE), check = FALSE ) #> # A tibble: 3 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> #> 1 nanoarrow 5.18ms 5.66ms 174. 16.2MB 189. #> 2 arrow_mmap 613.48µs 640.5µs 1526. 528.6KB 13.9 #> 3 arrow 2.18ms 2.84ms 339. 551.6KB 4.06 ``` <sup>Created on 2024-02-19 with [reprex v2.0.2](https://reprex.tidyverse.org)</sup>
1 parent c66ddc3 commit e4f0754

File tree

17 files changed

+797
-54
lines changed

17 files changed

+797
-54
lines changed

.github/workflows/r-check.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
R_KEEP_PKG_SOURCE: yes
5252

5353
steps:
54-
- uses: actions/checkout@v2
54+
- uses: actions/checkout@v4
5555

5656
- uses: r-lib/actions/setup-pandoc@v2
5757
- uses: r-lib/actions/setup-r@v2
@@ -66,14 +66,14 @@ jobs:
6666
if: matrix.config.os != 'windows-latest'
6767
env:
6868
PKG_CPPFLAGS: "-DNANOARROW_DEBUG"
69-
PKG_CFLAGS: "-Werror -Wall -Wextra -Wpedantic -Wconversion -Wno-unused-parameter -Wno-sign-conversion -Wno-cast-function-type"
69+
PKG_CFLAGS: "-Werror -Wall -Wextra -Wpedantic -Wconversion -Wno-unused-parameter -Wno-sign-conversion -Wno-cast-function-type -Wno-misleading-indentation -Wno-conversion -Wno-unused-const-variable"
7070
run: |
7171
R CMD INSTALL r --preclean
7272
shell: bash
7373

7474
- uses: r-lib/actions/setup-r-dependencies@v2
7575
with:
76-
extra-packages: any::rcmdcheck, arrow=?ignore-before-r=4.0.0, github::r-lib/[email protected]
76+
extra-packages: any::rcmdcheck, arrow=?ignore-before-r=4.0.0
7777
needs: check
7878
working-directory: r
7979

extensions/nanoarrow_ipc/CMakeLists.txt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,35 @@ else()
170170

171171
endif()
172172

173+
# Don't add extra warning flags when bundling, since we treat flatcc
174+
# as a part of the nanoarrow_ipc target and we have no control over the
175+
# warnings it produces.
176+
if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND NOT NANOARROW_IPC_BUNDLE)
177+
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
178+
target_compile_options(nanoarrow_ipc
179+
PRIVATE -Wall
180+
-Werror
181+
-Wextra
182+
-Wpedantic
183+
-Wno-type-limits
184+
-Wmaybe-uninitialized
185+
-Wunused-result
186+
-Wconversion
187+
-Wno-sign-conversion
188+
-Wno-misleading-indentation)
189+
elseif(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_C_COMPILER_ID STREQUAL
190+
"Clang")
191+
target_compile_options(nanoarrow_ipc
192+
PRIVATE -Wall
193+
-Werror
194+
-Wextra
195+
-Wpedantic
196+
-Wdocumentation
197+
-Wconversion
198+
-Wno-sign-conversion)
199+
endif()
200+
endif()
201+
173202
if(NANOARROW_IPC_BUILD_TESTS)
174203
set(MEMORYCHECK_COMMAND_OPTIONS
175204
"--leak-check=full --suppressions=${CMAKE_CURRENT_LIST_DIR}/../../valgrind.supp --error-exitcode=1"

extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_decoder.c

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,14 @@
4646
#include "nanoarrow_ipc.h"
4747
#include "nanoarrow_ipc_flatcc_generated.h"
4848

49+
// R 3.6 / Windows builds on a very old toolchain that does not define ENODATA
50+
#if defined(_WIN32) && !defined(_MSC_VER) && !defined(ENODATA)
51+
#define ENODATA 120
52+
#endif
53+
4954
// A more readable way to refer to the fact that there are 8 bytes
5055
// at the beginning of every message header.
51-
const static int64_t kMessageHeaderPrefixSize = 8;
56+
static const int32_t kMessageHeaderPrefixSize = 8;
5257

5358
// Internal representation of a parsed "Field" from flatbuffers. This
5459
// represents a field in a depth-first walk of column arrays and their
@@ -155,6 +160,10 @@ int ArrowIpcSharedBufferIsThreadSafe(void) { return 0; }
155160

156161
static void ArrowIpcSharedBufferFree(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
157162
int64_t size) {
163+
NANOARROW_UNUSED(allocator);
164+
NANOARROW_UNUSED(ptr);
165+
NANOARROW_UNUSED(size);
166+
158167
struct ArrowIpcSharedBufferPrivate* private_data =
159168
(struct ArrowIpcSharedBufferPrivate*)allocator->private_data;
160169

@@ -455,8 +464,10 @@ static int ArrowIpcDecoderSetTypeFixedSizeBinary(struct ArrowSchema* schema,
455464
struct ArrowError* error) {
456465
ns(FixedSizeBinary_table_t) type = (ns(FixedSizeBinary_table_t))type_generic;
457466
int fixed_size = ns(FixedSizeBinary_byteWidth(type));
458-
return ArrowSchemaSetTypeFixedSize(schema, NANOARROW_TYPE_FIXED_SIZE_BINARY,
459-
fixed_size);
467+
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
468+
ArrowSchemaSetTypeFixedSize(schema, NANOARROW_TYPE_FIXED_SIZE_BINARY, fixed_size),
469+
error);
470+
return NANOARROW_OK;
460471
}
461472

462473
static int ArrowIpcDecoderSetTypeDate(struct ArrowSchema* schema,
@@ -488,7 +499,7 @@ static int ArrowIpcDecoderSetTypeTime(struct ArrowSchema* schema,
488499
case ns(TimeUnit_MILLISECOND):
489500
if (bitwidth != 32) {
490501
ArrowErrorSet(error, "Expected bitwidth of 32 for Time TimeUnit %s but found %d",
491-
ns(TimeUnit_name(time_unit)), bitwidth);
502+
ns(TimeUnit_name(ns(Time_unit(type)))), bitwidth);
492503
return EINVAL;
493504
}
494505

@@ -499,7 +510,7 @@ static int ArrowIpcDecoderSetTypeTime(struct ArrowSchema* schema,
499510
case ns(TimeUnit_NANOSECOND):
500511
if (bitwidth != 64) {
501512
ArrowErrorSet(error, "Expected bitwidth of 64 for Time TimeUnit %s but found %d",
502-
ns(TimeUnit_name(time_unit)), bitwidth);
513+
ns(TimeUnit_name(ns(Time_unit(type)))), bitwidth);
503514
return EINVAL;
504515
}
505516

@@ -644,7 +655,6 @@ static int ArrowIpcDecoderSetTypeUnion(struct ArrowSchema* schema,
644655
int format_out_size = sizeof(union_types_str);
645656
int n_chars = 0;
646657

647-
const char* format_prefix;
648658
switch (union_mode) {
649659
case ns(UnionMode_Sparse):
650660
n_chars = snprintf(format_cursor, format_out_size, "+us:");
@@ -826,9 +836,6 @@ static int ArrowIpcDecoderSetChildren(struct ArrowSchema* schema, ns(Field_vec_t
826836
static int ArrowIpcDecoderDecodeSchemaHeader(struct ArrowIpcDecoder* decoder,
827837
flatbuffers_generic_t message_header,
828838
struct ArrowError* error) {
829-
struct ArrowIpcDecoderPrivate* private_data =
830-
(struct ArrowIpcDecoderPrivate*)decoder->private_data;
831-
832839
ns(Schema_table_t) schema = (ns(Schema_table_t))message_header;
833840
int endianness = ns(Schema_endianness(schema));
834841
switch (endianness) {
@@ -977,9 +984,6 @@ static inline int ArrowIpcDecoderReadHeaderPrefix(struct ArrowIpcDecoder* decode
977984
ArrowErrorCode ArrowIpcDecoderPeekHeader(struct ArrowIpcDecoder* decoder,
978985
struct ArrowBufferView data,
979986
struct ArrowError* error) {
980-
struct ArrowIpcDecoderPrivate* private_data =
981-
(struct ArrowIpcDecoderPrivate*)decoder->private_data;
982-
983987
ArrowIpcDecoderResetHeaderInfo(decoder);
984988
NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderReadHeaderPrefix(
985989
decoder, &data, &decoder->header_size_bytes, error));
@@ -1051,7 +1055,7 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder* decoder,
10511055
}
10521056

10531057
// Read some basic information from the message
1054-
int32_t metadata_version = ns(Message_version(message));
1058+
decoder->metadata_version = ns(Message_version(message));
10551059
decoder->message_type = ns(Message_header_type(message));
10561060
decoder->body_size_bytes = ns(Message_bodyLength(message));
10571061

@@ -1063,7 +1067,7 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder* decoder,
10631067
case ns(MetadataVersion_V3):
10641068
case ns(MetadataVersion_V4):
10651069
ArrowErrorSet(error, "Expected metadata version V5 but found %s",
1066-
ns(MetadataVersion_name(decoder->metadata_version)));
1070+
ns(MetadataVersion_name(ns(Message_version(message)))));
10671071
break;
10681072
default:
10691073
ArrowErrorSet(error, "Unexpected value for Message metadata version (%d)",
@@ -1085,7 +1089,7 @@ ArrowErrorCode ArrowIpcDecoderDecodeHeader(struct ArrowIpcDecoder* decoder,
10851089
case ns(MessageHeader_Tensor):
10861090
case ns(MessageHeader_SparseTensor):
10871091
ArrowErrorSet(error, "Unsupported message type: '%s'",
1088-
ns(MessageHeader_type_name(decoder->message_type)));
1092+
ns(MessageHeader_type_name(ns(Message_header_type(message)))));
10891093
return ENOTSUP;
10901094
default:
10911095
ArrowErrorSet(error, "Unknown message type: %d", (int)(decoder->message_type));
@@ -1245,7 +1249,7 @@ struct ArrowIpcBufferSource {
12451249
int64_t buffer_length_bytes;
12461250
enum ArrowIpcCompressionType codec;
12471251
enum ArrowType data_type;
1248-
int32_t element_size_bits;
1252+
int64_t element_size_bits;
12491253
int swap_endian;
12501254
};
12511255

@@ -1284,6 +1288,10 @@ static ArrowErrorCode ArrowIpcMakeBufferFromView(struct ArrowIpcBufferFactory* f
12841288
struct ArrowBufferView* dst_view,
12851289
struct ArrowBuffer* dst,
12861290
struct ArrowError* error) {
1291+
NANOARROW_UNUSED(factory);
1292+
NANOARROW_UNUSED(dst);
1293+
NANOARROW_UNUSED(error);
1294+
12871295
struct ArrowBufferView* body = (struct ArrowBufferView*)factory->private_data;
12881296
dst_view->data.as_uint8 = body->data.as_uint8 + src->body_offset_bytes;
12891297
dst_view->size_bytes = src->buffer_length_bytes;
@@ -1303,6 +1311,8 @@ static ArrowErrorCode ArrowIpcMakeBufferFromShared(struct ArrowIpcBufferFactory*
13031311
struct ArrowBufferView* dst_view,
13041312
struct ArrowBuffer* dst,
13051313
struct ArrowError* error) {
1314+
NANOARROW_UNUSED(error);
1315+
13061316
struct ArrowIpcSharedBuffer* shared =
13071317
(struct ArrowIpcSharedBuffer*)factory->private_data;
13081318
ArrowBufferReset(dst);
@@ -1364,7 +1374,7 @@ static int ArrowIpcDecoderSwapEndian(struct ArrowIpcBufferSource* src,
13641374
const uint64_t* ptr_src = out_view->data.as_uint64;
13651375
uint64_t* ptr_dst = (uint64_t*)dst->data;
13661376
uint64_t words[4];
1367-
int n_words = src->element_size_bits / 64;
1377+
int n_words = (int)(src->element_size_bits / 64);
13681378

13691379
for (int64_t i = 0; i < (dst->size_bytes / n_words / 8); i++) {
13701380
for (int j = 0; j < n_words; j++) {

extensions/nanoarrow_ipc/src/nanoarrow/nanoarrow_ipc_reader.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222
#include "nanoarrow.h"
2323
#include "nanoarrow_ipc.h"
2424

25+
// R 3.6 / Windows builds on a very old toolchain that does not define ENODATA
26+
#if defined(_WIN32) && !defined(_MSC_VER) && !defined(ENODATA)
27+
#define ENODATA 120
28+
#endif
29+
2530
void ArrowIpcInputStreamMove(struct ArrowIpcInputStream* src,
2631
struct ArrowIpcInputStream* dst) {
2732
memcpy(dst, src, sizeof(struct ArrowIpcInputStream));
@@ -37,6 +42,8 @@ static ArrowErrorCode ArrowIpcInputStreamBufferRead(struct ArrowIpcInputStream*
3742
uint8_t* buf, int64_t buf_size_bytes,
3843
int64_t* size_read_out,
3944
struct ArrowError* error) {
45+
NANOARROW_UNUSED(error);
46+
4047
if (buf_size_bytes == 0) {
4148
*size_read_out = 0;
4249
return NANOARROW_OK;

r/.covrignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,6 @@
1717

1818
src/nanoarrow.c
1919
src/nanoarrow.h
20+
src/nanoarrow_ipc.h
21+
src/nanoarrow_ipc.c
22+
src/flatcc*

r/NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ S3method(print,nanoarrow_array)
108108
S3method(print,nanoarrow_array_stream)
109109
S3method(print,nanoarrow_buffer)
110110
S3method(print,nanoarrow_schema)
111+
S3method(read_nanoarrow,character)
112+
S3method(read_nanoarrow,connection)
113+
S3method(read_nanoarrow,raw)
111114
S3method(str,nanoarrow_array)
112115
S3method(str,nanoarrow_array_stream)
113116
S3method(str,nanoarrow_buffer)
@@ -124,6 +127,7 @@ export(convert_array)
124127
export(convert_array_extension)
125128
export(convert_array_stream)
126129
export(convert_buffer)
130+
export(example_ipc_stream)
127131
export(infer_nanoarrow_ptype)
128132
export(infer_nanoarrow_ptype_extension)
129133
export(infer_nanoarrow_schema)
@@ -188,6 +192,7 @@ export(nanoarrow_pointer_set_protected)
188192
export(nanoarrow_schema_modify)
189193
export(nanoarrow_schema_parse)
190194
export(nanoarrow_version)
195+
export(read_nanoarrow)
191196
export(register_nanoarrow_extension)
192197
export(resolve_nanoarrow_extension)
193198
export(unregister_nanoarrow_extension)

0 commit comments

Comments
 (0)