Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(r): Refactor ArrowArray(Stream) -> R Vector conversion #392

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7a90fe1
rename nanoarow_cpp to reflect purpose
paleolimbot Feb 24, 2024
6605e7b
start the vctr builder
paleolimbot Feb 25, 2024
d145da8
sketch
paleolimbot Feb 25, 2024
4a1983f
clean up dispatch
paleolimbot Feb 26, 2024
e1b6e42
a little bit more dispatch
paleolimbot Feb 26, 2024
b1e5d90
maybe fix compile
paleolimbot Feb 27, 2024
2bec5d3
some dispatch
paleolimbot Feb 28, 2024
bbf46d7
start migrating infer_ptype
paleolimbot Feb 28, 2024
3de01c7
with data.frame
paleolimbot Feb 28, 2024
d28efa7
with passing tests for infer
paleolimbot Feb 28, 2024
42b1b85
remove ptype bits
paleolimbot Feb 28, 2024
15f2e17
start to split out builder classes
paleolimbot Feb 28, 2024
04f56cd
split out classes into smaller files
paleolimbot Feb 28, 2024
5e7df94
first conversion
paleolimbot Feb 28, 2024
c1e20ac
add an array view
paleolimbot Mar 1, 2024
78d3d81
maybe have conversion errors
paleolimbot Mar 1, 2024
7e35f5d
maybe some actual conversions
paleolimbot Mar 1, 2024
d269d8b
fix can't convert
paleolimbot Mar 1, 2024
a931c5b
working for integers
paleolimbot Mar 1, 2024
9e38a65
shuffle
paleolimbot Mar 6, 2024
2688e28
add double impl
paleolimbot Mar 6, 2024
24d301b
get logical conversion ported
paleolimbot Mar 6, 2024
1782799
port int64 to new class
paleolimbot Mar 6, 2024
e907d6b
fix lossy convert
paleolimbot Mar 6, 2024
14b9ef5
format
paleolimbot Mar 6, 2024
86a681d
wire up chr
paleolimbot Mar 10, 2024
c9b8720
wire up blob
paleolimbot Mar 10, 2024
cb88afe
wire up date converter
paleolimbot Mar 10, 2024
f439865
wire up more converters
paleolimbot Mar 10, 2024
90c8a0d
wire up posixct
paleolimbot Mar 10, 2024
dcd08a9
start on the call-into-r
paleolimbot Mar 14, 2024
a5a9b6f
always route extension types and dictionaires through the Other route
paleolimbot Mar 15, 2024
25b7b88
sketch "other" support
paleolimbot Mar 15, 2024
873fd8e
pass optional ownership info
paleolimbot Mar 15, 2024
16225e2
prototype method
paleolimbot Mar 15, 2024
15b38b6
fix init
paleolimbot Jul 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions r/src/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "buffer.h"
#include "nanoarrow.h"
#include "preserve.h"

void finalize_buffer_xptr(SEXP buffer_xptr) {
struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr);
Expand Down
1 change: 1 addition & 0 deletions r/src/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <Rinternals.h>

#include "nanoarrow.h"
#include "preserve.h"
#include "util.h"

void finalize_buffer_xptr(SEXP buffer_xptr);
Expand Down
68 changes: 2 additions & 66 deletions r/src/infer_ptype.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
// that later warns for out-of-range values (e.g., int64 to double());
// however, a user can use the convert_array(x, ptype = something_safer())
// when this occurs.
enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) {
static enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) {
switch (type) {
case NANOARROW_TYPE_BOOL:
return VECTOR_TYPE_LGL;
Expand Down Expand Up @@ -69,7 +69,7 @@ enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) {
}

// The same as the above, but from a nanoarrow_schema()
enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) {
static enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) {
struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr);

struct ArrowSchemaView schema_view;
Expand All @@ -89,67 +89,3 @@ enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) {
// The same as the above, but from a nanoarrow_array(): the schema is obtained
// via array_xptr_get_schema() and passed to the schema-based inference.
enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr) {
  SEXP schema_xptr = array_xptr_get_schema(array_xptr);
  return nanoarrow_infer_vector_type_schema(schema_xptr);
}

// Fallback for less common types: evaluate infer_ptype_other(schema_xptr)
// in the nanoarrow namespace (nanoarrow_ns_pkg). The R function either
// computes the ptype (for types that are easier to handle in R) or raises
// an informative error when no ptype can be inferred.
static SEXP call_infer_ptype_other(SEXP schema_xptr) {
  SEXP infer_sym = PROTECT(Rf_install("infer_ptype_other"));
  SEXP infer_call = PROTECT(Rf_lang2(infer_sym, schema_xptr));
  SEXP ptype = PROTECT(Rf_eval(infer_call, nanoarrow_ns_pkg));

  // Releases infer_sym, infer_call and ptype; the caller is responsible for
  // protecting the returned value if it allocates afterwards.
  UNPROTECT(3);
  return ptype;
}

SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);

// Build a data.frame ptype from a schema with children: one column per child,
// each column being the ptype inferred (recursively) for that child schema.
// Children without a name get an empty string as their column name.
static SEXP infer_ptype_data_frame(SEXP schema_xptr) {
  struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr);
  const int64_t n_cols = schema->n_children;

  SEXP columns = PROTECT(Rf_allocVector(VECSXP, n_cols));
  SEXP column_names = PROTECT(Rf_allocVector(STRSXP, n_cols));

  for (R_xlen_t col = 0; col < n_cols; col++) {
    // The child xptr only needs to stay protected while its ptype is computed
    // and stored into the (already protected) result list.
    SEXP child_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, col));
    SET_VECTOR_ELT(columns, col, nanoarrow_c_infer_ptype(child_xptr));
    UNPROTECT(1);

    const char* child_name = schema->children[col]->name;
    if (child_name == NULL) {
      SET_STRING_ELT(column_names, col, Rf_mkChar(""));
    } else {
      SET_STRING_ELT(column_names, col, Rf_mkCharCE(child_name, CE_UTF8));
    }
  }

  Rf_setAttrib(columns, R_ClassSymbol, nanoarrow_cls_data_frame);
  Rf_setAttrib(columns, R_NamesSymbol, column_names);

  // row.names are stored as the two-element integer vector (NA, 0), R's
  // compact representation of row names for a data.frame with zero rows.
  SEXP row_names = PROTECT(Rf_allocVector(INTSXP, 2));
  int* row_names_data = INTEGER(row_names);
  row_names_data[0] = NA_INTEGER;
  row_names_data[1] = 0;
  Rf_setAttrib(columns, R_RowNamesSymbol, row_names);

  // columns, column_names, row_names
  UNPROTECT(3);
  return columns;
}

// Infer an R ptype for a nanoarrow_schema():
// - lgl/int/dbl/chr vector types: a zero-length vector from
//   nanoarrow_alloc_type()
// - data frame vector type: built column by column by infer_ptype_data_frame()
// - anything else: delegated to R via call_infer_ptype_other()
SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) {
  const enum VectorType vector_type = nanoarrow_infer_vector_type_schema(schema_xptr);

  const int is_atomic_type =
      vector_type == VECTOR_TYPE_LGL || vector_type == VECTOR_TYPE_INT ||
      vector_type == VECTOR_TYPE_DBL || vector_type == VECTOR_TYPE_CHR;

  SEXP ptype = R_NilValue;
  if (is_atomic_type) {
    ptype = PROTECT(nanoarrow_alloc_type(vector_type, 0));
  } else if (vector_type == VECTOR_TYPE_DATA_FRAME) {
    ptype = PROTECT(infer_ptype_data_frame(schema_xptr));
  } else {
    ptype = PROTECT(call_infer_ptype_other(schema_xptr));
  }

  // Exactly one of the branches above protected ptype
  UNPROTECT(1);
  return ptype;
}
3 changes: 3 additions & 0 deletions r/src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <Rinternals.h>

#include "altrep.h"
#include "preserve.h"
#include "util.h"

/* generated by tools/make-callentries.R */
Expand Down Expand Up @@ -91,6 +92,7 @@ extern SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP diction
extern SEXP nanoarrow_c_preserved_count(void);
extern SEXP nanoarrow_c_preserved_empty(void);
extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj);
extern SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp);
extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list);
extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp);
extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp);
Expand Down Expand Up @@ -168,6 +170,7 @@ static const R_CallMethodDef CallEntries[] = {
{"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0},
{"nanoarrow_c_preserve_and_release_on_other_thread",
(DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1},
{"nanoarrow_c_convert_array2", (DL_FUNC)&nanoarrow_c_convert_array2, 2},
{"nanoarrow_c_vctr_chunk_offsets", (DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1},
{"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2},
{"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1},
Expand Down
8 changes: 8 additions & 0 deletions r/src/materialize.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@

#include "materialize_common.h"

#ifdef __cplusplus
extern "C" {
#endif

// A heuristic to identify prototypes that should be treated like data frames
// (i.e., including record-style vectors like POSIXct). This heuristic returns
// true if ptype is a data.frame or is an S3 list with names.
Expand All @@ -46,4 +50,8 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len);
// nanoarrow_vctr conversion.
int nanoarrow_materialize_finalize_result(SEXP converter_xptr);

#ifdef __cplusplus
}
#endif

#endif
2 changes: 2 additions & 0 deletions r/src/materialize_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@
enum VectorType {
VECTOR_TYPE_UNINITIALIZED,
VECTOR_TYPE_NULL,
VECTOR_TYPE_RAW,
VECTOR_TYPE_UNSPECIFIED,
VECTOR_TYPE_LGL,
VECTOR_TYPE_INT,
VECTOR_TYPE_DBL,
VECTOR_TYPE_ALTREP_CHR,
VECTOR_TYPE_CHR,
VECTOR_TYPE_HMS,
VECTOR_TYPE_POSIXCT,
VECTOR_TYPE_DATE,
VECTOR_TYPE_DIFFTIME,
Expand Down
4 changes: 1 addition & 3 deletions r/src/pointers.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,9 @@

#include "array.h"
#include "array_stream.h"
#include "preserve.h"
#include "schema.h"

// More reliable way to stringify intptr_t on Windows using C++
void intptr_as_string(intptr_t ptr_int, char* buf);

// .Call() entry point: returns the result of nanoarrow_schema_owning_xptr().
SEXP nanoarrow_c_allocate_schema(void) {
  SEXP schema_xptr = nanoarrow_schema_owning_xptr();
  return schema_xptr;
}

// .Call() entry point: returns the result of nanoarrow_array_owning_xptr().
SEXP nanoarrow_c_allocate_array(void) {
  SEXP array_xptr = nanoarrow_array_owning_xptr();
  return array_xptr;
}
Expand Down
18 changes: 10 additions & 8 deletions r/src/nanoarrow_cpp.cc → r/src/preserve.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include <thread>
#include <vector>

#include "preserve.h"

// Without this infrastructure, it's possible to check that all objects
// are released by running devtools::test(); gc() in a fresh session and
// making sure that nanoarrow:::preserved_count() is zero afterward.
Expand All @@ -35,7 +37,7 @@
#include <unordered_map>
#endif

extern "C" void intptr_as_string(intptr_t ptr_int, char* buf) {
// Write the decimal representation of ptr_int into buf as a NUL-terminated
// C string.
//
// buf must have room for every digit, an optional leading '-', and the
// terminating NUL (22 bytes is enough for a 64-bit intptr_t). The terminator
// is written explicitly so that the result is a valid C string even when the
// caller did not zero the buffer beforehand.
void intptr_as_string(intptr_t ptr_int, char* buf) {
  const std::string ptr_str = std::to_string(ptr_int);
  // c_str() is guaranteed to be NUL-terminated; size() + 1 copies the
  // terminator along with the digits.
  memcpy(buf, ptr_str.c_str(), ptr_str.size() + 1);
}
Expand Down Expand Up @@ -166,37 +168,37 @@ class PreservedSEXPRegistry {
#endif
};

extern "C" void nanoarrow_preserve_init(void) { PreservedSEXPRegistry::GetInstance(); }
// Calls PreservedSEXPRegistry::GetInstance() and discards the result.
void nanoarrow_preserve_init(void) {
  (void)PreservedSEXPRegistry::GetInstance();
}

extern "C" void nanoarrow_preserve_sexp(SEXP obj) {
// Hand obj to the registry's preserve() method.
void nanoarrow_preserve_sexp(SEXP obj) {
  auto&& registry = PreservedSEXPRegistry::GetInstance();
  registry.preserve(obj);
}

extern "C" void nanoarrow_release_sexp(SEXP obj) {
// Hand obj to the registry's release() method. Any std::exception raised
// while doing so is deliberately swallowed: this is a best-effort operation
// and must not crash the process.
void nanoarrow_release_sexp(SEXP obj) {
  try {
    auto&& registry = PreservedSEXPRegistry::GetInstance();
    registry.release(obj);
  } catch (const std::exception&) {
    // Just for safety...we really don't want to crash here
  }
}

extern "C" int64_t nanoarrow_preserved_count(void) {
// Returns the registry's size() as an int64_t.
int64_t nanoarrow_preserved_count(void) {
  auto&& registry = PreservedSEXPRegistry::GetInstance();
  return registry.size();
}

extern "C" int64_t nanoarrow_preserved_empty(void) {
// Returns the value reported by the registry's empty_trash(), or 0 if a
// std::exception is thrown while calling it.
int64_t nanoarrow_preserved_empty(void) {
  int64_t result = 0;
  try {
    result = PreservedSEXPRegistry::GetInstance().empty_trash();
  } catch (const std::exception&) {
    result = 0;
  }
  return result;
}

extern "C" int nanoarrow_is_main_thread(void) {
// Returns the registry's is_main_thread() result converted to int.
int nanoarrow_is_main_thread(void) {
  auto&& registry = PreservedSEXPRegistry::GetInstance();
  return registry.is_main_thread();
}

extern "C" void nanoarrow_preserve_and_release_on_other_thread(SEXP obj) {
void nanoarrow_preserve_and_release_on_other_thread(SEXP obj) {
nanoarrow_preserve_sexp(obj);
std::thread worker([obj] { nanoarrow_release_sexp(obj); });
worker.join();
Expand Down
52 changes: 52 additions & 0 deletions r/src/preserve.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef R_NANOARROW_PRESERVE_H_INCLUDED
#define R_NANOARROW_PRESERVE_H_INCLUDED

#define R_NO_REMAP
#include <R.h>
#include <Rinternals.h>

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Not really related to preserve/release, but needs C++
void intptr_as_string(intptr_t ptr_int, char* buf);

// Internal abstractions for R_PreserveObject and R_ReleaseObject
// that provide an opportunity for debugging information about
// preserved object lifecycle and possible future optimizations.
// These implementations use C++ and live in preserve.cc
void nanoarrow_preserve_init(void);
void nanoarrow_preserve_sexp(SEXP obj);
void nanoarrow_release_sexp(SEXP obj);
int64_t nanoarrow_preserved_count(void);
int64_t nanoarrow_preserved_empty(void);
int nanoarrow_is_main_thread(void);

// For testing
void nanoarrow_preserve_and_release_on_other_thread(SEXP obj);

#ifdef __cplusplus
}
#endif

#endif
1 change: 1 addition & 0 deletions r/src/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <R.h>
#include <Rinternals.h>

#include "preserve.h"
#include "util.h"

SEXP nanoarrow_ns_pkg = NULL;
Expand Down
14 changes: 0 additions & 14 deletions r/src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,6 @@ extern SEXP nanoarrow_ptype_raw;

void nanoarrow_init_cached_sexps(void);

// Internal abstractions for R_PreserveObject and R_ReleaseObject
// that provide an opportunity for debugging information about
// preserved object lifecycle and possible future optimizations.
// These implementations use C++ and live in nanoarrow_cpp.cc
void nanoarrow_preserve_init(void);
void nanoarrow_preserve_sexp(SEXP obj);
void nanoarrow_release_sexp(SEXP obj);
int64_t nanoarrow_preserved_count(void);
int64_t nanoarrow_preserved_empty(void);
int nanoarrow_is_main_thread(void);

// For testing
void nanoarrow_preserve_and_release_on_other_thread(SEXP obj);

// Checker for very small mallocs()
static inline void check_trivial_alloc(const void* ptr, const char* ptr_type) {
if (ptr == NULL) {
Expand Down
Loading
Loading