From 7a90fe110f2aca7c58c6f8c50e586f93e99ae432 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 24 Feb 2024 19:04:59 -0400 Subject: [PATCH 01/36] rename nanoarrow_cpp to reflect purpose --- r/src/buffer.c | 1 + r/src/buffer.h | 1 + r/src/init.c | 1 + r/src/pointers.c | 4 +-- r/src/{nanoarrow_cpp.cc => preserve.cc} | 18 +++++----- r/src/preserve.h | 45 +++++++++++++++++++++++++ r/src/util.h | 14 -------- 7 files changed, 59 insertions(+), 25 deletions(-) rename r/src/{nanoarrow_cpp.cc => preserve.cc} (91%) create mode 100644 r/src/preserve.h diff --git a/r/src/buffer.c b/r/src/buffer.c index 55e522288..f38148a16 100644 --- a/r/src/buffer.c +++ b/r/src/buffer.c @@ -23,6 +23,7 @@ #include "buffer.h" #include "nanoarrow.h" +#include "preserve.h" void finalize_buffer_xptr(SEXP buffer_xptr) { struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); diff --git a/r/src/buffer.h b/r/src/buffer.h index 2dcc49eb8..9d21443cf 100644 --- a/r/src/buffer.h +++ b/r/src/buffer.h @@ -22,6 +22,7 @@ #include #include "nanoarrow.h" +#include "preserve.h" #include "util.h" void finalize_buffer_xptr(SEXP buffer_xptr); diff --git a/r/src/init.c b/r/src/init.c index 69c943911..417da0f8e 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -20,6 +20,7 @@ #include #include "altrep.h" +#include "preserve.h" #include "util.h" /* generated by tools/make-callentries.R */ diff --git a/r/src/pointers.c b/r/src/pointers.c index 110029b8c..44f83a075 100644 --- a/r/src/pointers.c +++ b/r/src/pointers.c @@ -21,11 +21,9 @@ #include "array.h" #include "array_stream.h" +#include "preserve.h" #include "schema.h" -// More reliable way to stringify intptr_t on Windows using C++ -void intptr_as_string(intptr_t ptr_int, char* buf); - SEXP nanoarrow_c_allocate_schema(void) { return nanoarrow_schema_owning_xptr(); } SEXP nanoarrow_c_allocate_array(void) { return nanoarrow_array_owning_xptr(); } diff --git a/r/src/nanoarrow_cpp.cc b/r/src/preserve.cc similarity index 91% rename from
r/src/nanoarrow_cpp.cc rename to r/src/preserve.cc index 9c0e38d68..7e62faf3a 100644 --- a/r/src/nanoarrow_cpp.cc +++ b/r/src/preserve.cc @@ -26,6 +26,8 @@ #include #include +#include "preserve.h" + // Without this infrastructure, it's possible to check that all objects // are released by running devtools::test(); gc() in a fresh session and // making sure that nanoarrow:::preserved_count() is zero afterward. @@ -35,7 +37,7 @@ #include #endif -extern "C" void intptr_as_string(intptr_t ptr_int, char* buf) { +void intptr_as_string(intptr_t ptr_int, char* buf) { std::string ptr_str = std::to_string(ptr_int); memcpy(buf, ptr_str.data(), ptr_str.size()); } @@ -166,13 +168,13 @@ class PreservedSEXPRegistry { #endif }; -extern "C" void nanoarrow_preserve_init(void) { PreservedSEXPRegistry::GetInstance(); } +void nanoarrow_preserve_init(void) { PreservedSEXPRegistry::GetInstance(); } -extern "C" void nanoarrow_preserve_sexp(SEXP obj) { +void nanoarrow_preserve_sexp(SEXP obj) { PreservedSEXPRegistry::GetInstance().preserve(obj); } -extern "C" void nanoarrow_release_sexp(SEXP obj) { +void nanoarrow_release_sexp(SEXP obj) { try { PreservedSEXPRegistry::GetInstance().release(obj); } catch (std::exception& e) { @@ -180,11 +182,11 @@ extern "C" void nanoarrow_release_sexp(SEXP obj) { } } -extern "C" int64_t nanoarrow_preserved_count(void) { +int64_t nanoarrow_preserved_count(void) { return PreservedSEXPRegistry::GetInstance().size(); } -extern "C" int64_t nanoarrow_preserved_empty(void) { +int64_t nanoarrow_preserved_empty(void) { try { return PreservedSEXPRegistry::GetInstance().empty_trash(); } catch (std::exception& e) { @@ -192,11 +194,11 @@ extern "C" int64_t nanoarrow_preserved_empty(void) { } } -extern "C" int nanoarrow_is_main_thread(void) { +int nanoarrow_is_main_thread(void) { return PreservedSEXPRegistry::GetInstance().is_main_thread(); } -extern "C" void nanoarrow_preserve_and_release_on_other_thread(SEXP obj) { +void 
nanoarrow_preserve_and_release_on_other_thread(SEXP obj) { nanoarrow_preserve_sexp(obj); std::thread worker([obj] { nanoarrow_release_sexp(obj); }); worker.join(); diff --git a/r/src/preserve.h b/r/src/preserve.h new file mode 100644 index 000000000..9f8a83176 --- /dev/null +++ b/r/src/preserve.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#define R_NO_REMAP +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Not really related to preserve/release, but needs C++ +void intptr_as_string(intptr_t ptr_int, char* buf); + +// Internal abstractions for R_PreserveObject and R_ReleaseObject +// that provide an opportunity for debugging information about +// preserved object lifecycle and possible future optimizations. 
+// These implementations use C++ and live in preserve.cc +void nanoarrow_preserve_init(void); +void nanoarrow_preserve_sexp(SEXP obj); +void nanoarrow_release_sexp(SEXP obj); +int64_t nanoarrow_preserved_count(void); +int64_t nanoarrow_preserved_empty(void); +int nanoarrow_is_main_thread(void); + +// For testing +void nanoarrow_preserve_and_release_on_other_thread(SEXP obj); + +#ifdef __cplusplus +} +#endif diff --git a/r/src/util.h b/r/src/util.h index d652330ed..14a37333a 100644 --- a/r/src/util.h +++ b/r/src/util.h @@ -36,20 +36,6 @@ extern SEXP nanoarrow_ptype_raw; void nanoarrow_init_cached_sexps(void); -// Internal abstractions for R_PreserveObject and R_ReleaseObject -// that provide an opportunity for debugging information about -// preserved object lifecycle and possible future optimizations. -// These implementations use C++ and live in nanoarrow_cpp.cc -void nanoarrow_preserve_init(void); -void nanoarrow_preserve_sexp(SEXP obj); -void nanoarrow_release_sexp(SEXP obj); -int64_t nanoarrow_preserved_count(void); -int64_t nanoarrow_preserved_empty(void); -int nanoarrow_is_main_thread(void); - -// For testing -void nanoarrow_preserve_and_release_on_other_thread(SEXP obj); - // Checker for very small mallocs() static inline void check_trivial_alloc(const void* ptr, const char* ptr_type) { if (ptr == NULL) { From 6605e7b03770374df58d354e864dc3890bdb9414 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 24 Feb 2024 22:33:15 -0400 Subject: [PATCH 02/36] start the vctr builder --- r/src/materialize.h | 8 ++ r/src/preserve.h | 5 ++ r/src/util.c | 1 + r/src/vctr_builder.cc | 191 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 205 insertions(+) create mode 100644 r/src/vctr_builder.cc diff --git a/r/src/materialize.h b/r/src/materialize.h index c3b2c5cbd..08e1e27cb 100644 --- a/r/src/materialize.h +++ b/r/src/materialize.h @@ -23,6 +23,10 @@ #include "materialize_common.h" +#ifdef __cplusplus +extern "C" { +#endif + // A heuristic to
identify prototypes that should be treated like data frames // (i.e., including record-style vectors like POSIXct). This heuristic returns // true if ptype is a data.frame or is an S3 list with names. @@ -46,4 +50,8 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len); // nanoarrow_vctr conversion. int nanoarrow_materialize_finalize_result(SEXP converter_xptr); +#ifdef __cplusplus +} +#endif + #endif diff --git a/r/src/preserve.h b/r/src/preserve.h index 9f8a83176..7c049d968 100644 --- a/r/src/preserve.h +++ b/r/src/preserve.h @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +#ifndef R_NANOARROW_PRESERVE_H_INCLUDED +#define R_NANOARROW_PRESERVE_H_INCLUDED + #define R_NO_REMAP #include #include @@ -43,3 +46,5 @@ void nanoarrow_preserve_and_release_on_other_thread(SEXP obj); #ifdef __cplusplus } #endif + +#endif diff --git a/r/src/util.c b/r/src/util.c index 6d4035ba6..6ef6d1669 100644 --- a/r/src/util.c +++ b/r/src/util.c @@ -19,6 +19,7 @@ #include #include +#include "preserve.h" #include "util.h" SEXP nanoarrow_ns_pkg = NULL; diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc new file mode 100644 index 000000000..464535577 --- /dev/null +++ b/r/src/vctr_builder.cc @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#define R_NO_REMAP +#include +#include + +#include "materialize.h" +#include "nanoarrow.h" +#include "nanoarrow/r.h" + +class VctrBuilder { + public: + class Options { + public: + int64_t num_items; + int use_altrep; + }; + + // If a ptype is supplied to a VctrBuilder, it must be supplied at construction + // and preserved until the value is no longer needed. + + // Enable generic containers like std::vector> + virtual ~VctrBuilder() {} + + // Initialize this instance with the information available to the resolver, or the + // information that was inferred. If using the default `to`, ptype may be R_NilValue + // with Options containing the inferred information. Calling this method may longjmp. + virtual ArrowErrorCode Init(const ArrowSchema* schema, const Options& options, + ArrowError* error) { + return ENOTSUP; + } + + // Push an array into this builder and do not take ownership of array. This is + // called when the caller cannot safely relinquish ownership of an array (e.g., + // convert_array()). Calling this method may longjmp. + virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) { + return ENOTSUP; + } + + // Push an array into this builder. The implementation may (but is not required to) take + // ownership. This is called when the caller can relinquish ownership (e.g., + // convert_array_stream()). Calling this method may longjmp. + virtual ArrowErrorCode PushNextOwning(ArrowArray* array, ArrowError* error) { + return PushNext(array, error); + } + + // Perform any final calculations required to calculate the return value. + // Calling this method may longjmp. + virtual ArrowErrorCode Finish(ArrowError* error) { return ENOTSUP; } + + // Extract the final value of the builder. Calling this method may longjmp.
+ virtual SEXP GetValue() { return R_NilValue; } +}; + +class IntBuilder : public VctrBuilder {}; +class DblBuilder : public VctrBuilder {}; +class ChrBuilder : public VctrBuilder {}; +class LglBuilder : public VctrBuilder {}; +class RcrdBuilder : public VctrBuilder { + public: + explicit RcrdBuilder(SEXP ptype_sexp) {} +}; +class UnspecifiedBuilder : public VctrBuilder {}; +class BlobBuilder : public VctrBuilder {}; +class ListOfBuilder : public VctrBuilder {}; +class DateBuilder : public VctrBuilder {}; +class HmsBuilder : public VctrBuilder {}; +class PosixctBuilder : public VctrBuilder {}; +class DifftimeBuilder : public VctrBuilder {}; +class Integer64Builder : public VctrBuilder {}; + +class ExtensionBuilder : public VctrBuilder { + public: + explicit ExtensionBuilder(SEXP ptype_sexp) {} +}; + +// Currently in infer_ptype.c +extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); +extern "C" SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); + +// Resolve a builder class +ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, + const VctrBuilder::Options* options, VctrBuilder** out, + ArrowError* error) { + // See if we can skip any ptype resolution at all + if (ptype_sexp == R_NilValue) { + ArrowSchemaView view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, error)); + + enum VectorType vector_type = nanoarrow_infer_vector_type(view.type); + switch (vector_type) { + case VECTOR_TYPE_LGL: + *out = new LglBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_INT: + *out = new IntBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_DBL: + *out = new DblBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_CHR: + *out = new LglBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_DATA_FRAME: + *out = new RcrdBuilder(R_NilValue); + return NANOARROW_OK; + default: + break; + } + + // Otherwise, resolve the ptype and use it (this will error for ptypes that can't be + // resolved) + SEXP schema_xptr = PROTECT( + 
R_MakeExternalPtr(const_cast(schema), R_NilValue, R_NilValue)); + Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); + SEXP inferred_ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(schema_xptr)); + int code = InstantiateBuilder(schema, inferred_ptype_sexp, options, out, error); + UNPROTECT(1); + return code; + } + + // Handle some S3 objects internally to avoid S3 dispatch (e.g., when looping over a + // data frame with a lot of columns) + if (Rf_isObject(ptype_sexp)) { + if (nanoarrow_ptype_is_data_frame(ptype_sexp)) { + *out = new RcrdBuilder(ptype_sexp); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified")) { + *out = new UnspecifiedBuilder(); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "blob")) { + *out = new BlobBuilder(); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "Date")) { + *out = new DateBuilder(); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "hms")) { + *out = new HmsBuilder(); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "POSIXct")) { + *out = new PosixctBuilder(); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "difftime")) { + *out = new DifftimeBuilder(); + return NANOARROW_OK; + } else if (Rf_inherits(ptype_sexp, "integer64")) { + *out = new Integer64Builder(); + return NANOARROW_OK; + } else { + *out = new ExtensionBuilder(ptype_sexp); + return NANOARROW_OK; + } + } + + // If we're here, these are non-S3 objects + switch (TYPEOF(ptype_sexp)) { + case LGLSXP: + *out = new LglBuilder(); + return NANOARROW_OK; + case INTSXP: + *out = new IntBuilder(); + return NANOARROW_OK; + case REALSXP: + *out = new DblBuilder(); + return NANOARROW_OK; + case STRSXP: + *out = new ChrBuilder(); + return NANOARROW_OK; + default: + *out = new ExtensionBuilder(ptype_sexp); + return NANOARROW_OK; + } + + *out = nullptr; + return ENOTSUP; +} From d145da812f684a5567c92c0c6685925f634ceec4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 24 Feb 2024 
22:41:07 -0400 Subject: [PATCH 03/36] sketch --- r/src/vctr_builder.cc | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 464535577..79fe82fb7 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -27,12 +27,12 @@ class VctrBuilder { public: class Options { public: - int64_t num_items; int use_altrep; }; // If a ptype is supplied to a VctrBuilder, it must be supplied at construction - // and preserved until the value is no longer needed. + // and preserved until the value is no longer needed. This is not an appropriate + // time to error. // Enable generic containers like std::vector> virtual ~VctrBuilder() {} @@ -61,27 +61,48 @@ class VctrBuilder { // Perform any final calculations required to calculate the return value. // Calling this method may longjmp. - virtual ArrowErrorCode Finish(ArrowError* error) { return ENOTSUP; } + virtual ArrowErrorCode Finish(ArrowError* error) { return NANOARROW_OK; } - // Extract the final value of the builder. Calling this method may longjmp. + // Release the final value of the builder. Calling this method may longjmp. 
virtual SEXP GetValue() { return R_NilValue; } }; class IntBuilder : public VctrBuilder {}; + class DblBuilder : public VctrBuilder {}; + class ChrBuilder : public VctrBuilder {}; + class LglBuilder : public VctrBuilder {}; + class RcrdBuilder : public VctrBuilder { public: explicit RcrdBuilder(SEXP ptype_sexp) {} }; + class UnspecifiedBuilder : public VctrBuilder {}; + class BlobBuilder : public VctrBuilder {}; -class ListOfBuilder : public VctrBuilder {}; + +class ListOfBuilder : public VctrBuilder { + public: + explicit ListOfBuilder(SEXP ptype_sexp) {} +}; + class DateBuilder : public VctrBuilder {}; + class HmsBuilder : public VctrBuilder {}; -class PosixctBuilder : public VctrBuilder {}; -class DifftimeBuilder : public VctrBuilder {}; + +class PosixctBuilder : public VctrBuilder { + public: + explicit PosixctBuilder(SEXP ptype_sexp) {} +}; + +class DifftimeBuilder : public VctrBuilder { + public: + explicit DifftimeBuilder(SEXP ptype_sexp) {} +}; + class Integer64Builder : public VctrBuilder {}; class ExtensionBuilder : public VctrBuilder { @@ -153,10 +174,10 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, *out = new HmsBuilder(); return NANOARROW_OK; } else if (Rf_inherits(ptype_sexp, "POSIXct")) { - *out = new PosixctBuilder(); + *out = new PosixctBuilder(ptype_sexp); return NANOARROW_OK; } else if (Rf_inherits(ptype_sexp, "difftime")) { - *out = new DifftimeBuilder(); + *out = new DifftimeBuilder(ptype_sexp); return NANOARROW_OK; } else if (Rf_inherits(ptype_sexp, "integer64")) { *out = new Integer64Builder(); From 4a1983f98e7ac4c562e5df1f13129f99c0bb7107 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 Feb 2024 17:45:21 -0400 Subject: [PATCH 04/36] clean up dispatch --- r/src/materialize_common.h | 1 + r/src/vctr_builder.cc | 129 +++++++++++++++++++++++-------------- r/src/vctr_builder.h | 57 ++++++++++++++++ 3 files changed, 137 insertions(+), 50 deletions(-) create mode 100644 r/src/vctr_builder.h diff --git 
a/r/src/materialize_common.h b/r/src/materialize_common.h index 6c811b615..480b4393f 100644 --- a/r/src/materialize_common.h +++ b/r/src/materialize_common.h @@ -37,6 +37,7 @@ enum VectorType { VECTOR_TYPE_DBL, VECTOR_TYPE_ALTREP_CHR, VECTOR_TYPE_CHR, + VECTOR_TYPE_HMS, VECTOR_TYPE_POSIXCT, VECTOR_TYPE_DATE, VECTOR_TYPE_DIFFTIME, diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 79fe82fb7..999aa6381 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -23,13 +23,10 @@ #include "nanoarrow.h" #include "nanoarrow/r.h" -class VctrBuilder { - public: - class Options { - public: - int use_altrep; - }; +#include "vctr_builder.h" +struct VctrBuilder { + public: // If a ptype is supplied to a VctrBuilder, it must be supplied at construction // and preserved until the value is no longer needed. This is not an appropriate // time to error. @@ -40,7 +37,7 @@ class VctrBuilder { // Initialize this instance with the information available to the resolver, or the // information that was inferred. If using the default `to`, ptype may be R_NilValue // with Options containing the inferred information. Calling this method may longjmp. 
- virtual ArrowErrorCode Init(const ArrowSchema* schema, const Options& options, + virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) { return ENOTSUP; } @@ -114,9 +111,63 @@ class ExtensionBuilder : public VctrBuilder { extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); extern "C" SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); +static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, + VectorType vector_type, SEXP ptype_sexp, + VctrBuilderOptions options, + VctrBuilder** out, ArrowError* error) { + switch (vector_type) { + case VECTOR_TYPE_NULL: + + case VECTOR_TYPE_LGL: + *out = new LglBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_INT: + *out = new IntBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_DBL: + *out = new DblBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_CHR: + *out = new LglBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_DATA_FRAME: + *out = new RcrdBuilder(ptype_sexp); + return NANOARROW_OK; + case VECTOR_TYPE_LIST_OF: + *out = new ListOfBuilder(ptype_sexp); + return NANOARROW_OK; + case VECTOR_TYPE_UNSPECIFIED: + *out = new UnspecifiedBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_BLOB: + *out = new BlobBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_DATE: + *out = new DateBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_HMS: + *out = new HmsBuilder(); + return NANOARROW_OK; + case VECTOR_TYPE_POSIXCT: + *out = new PosixctBuilder(ptype_sexp); + return NANOARROW_OK; + case VECTOR_TYPE_DIFFTIME: + *out = new DifftimeBuilder(ptype_sexp); + return NANOARROW_OK; + case VECTOR_TYPE_INTEGER64: + *out = new Integer64Builder(); + return NANOARROW_OK; + case VECTOR_TYPE_OTHER: + *out = new ExtensionBuilder(ptype_sexp); + return NANOARROW_OK; + default: + Rf_error("Unknown vector type id: %d", (int)vector_type); + } +} + // Resolve a builder class ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, - const 
VctrBuilder::Options* options, VctrBuilder** out, + VctrBuilderOptions options, VctrBuilder** out, ArrowError* error) { // See if we can skip any ptype resolution at all if (ptype_sexp == R_NilValue) { @@ -126,20 +177,12 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, enum VectorType vector_type = nanoarrow_infer_vector_type(view.type); switch (vector_type) { case VECTOR_TYPE_LGL: - *out = new LglBuilder(); - return NANOARROW_OK; case VECTOR_TYPE_INT: - *out = new IntBuilder(); - return NANOARROW_OK; case VECTOR_TYPE_DBL: - *out = new DblBuilder(); - return NANOARROW_OK; case VECTOR_TYPE_CHR: - *out = new LglBuilder(); - return NANOARROW_OK; case VECTOR_TYPE_DATA_FRAME: - *out = new RcrdBuilder(R_NilValue); - return NANOARROW_OK; + return InstantiateBuilderBase(schema, vector_type, R_NilValue, options, out, + error); default: break; } @@ -157,56 +200,42 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, // Handle some S3 objects internally to avoid S3 dispatch (e.g., when looping over a // data frame with a lot of columns) + enum VectorType vector_type = VECTOR_TYPE_OTHER; if (Rf_isObject(ptype_sexp)) { if (nanoarrow_ptype_is_data_frame(ptype_sexp)) { - *out = new RcrdBuilder(ptype_sexp); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_DATA_FRAME; } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified")) { - *out = new UnspecifiedBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_UNSPECIFIED; } else if (Rf_inherits(ptype_sexp, "blob")) { - *out = new BlobBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_BLOB; } else if (Rf_inherits(ptype_sexp, "Date")) { - *out = new DateBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_DATE; } else if (Rf_inherits(ptype_sexp, "hms")) { - *out = new HmsBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_HMS; } else if (Rf_inherits(ptype_sexp, "POSIXct")) { - *out = new PosixctBuilder(ptype_sexp); - return NANOARROW_OK; + 
vector_type = VECTOR_TYPE_POSIXCT; } else if (Rf_inherits(ptype_sexp, "difftime")) { - *out = new DifftimeBuilder(ptype_sexp); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_DIFFTIME; } else if (Rf_inherits(ptype_sexp, "integer64")) { - *out = new Integer64Builder(); - return NANOARROW_OK; - } else { - *out = new ExtensionBuilder(ptype_sexp); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_INTEGER64; } } // If we're here, these are non-S3 objects switch (TYPEOF(ptype_sexp)) { case LGLSXP: - *out = new LglBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_CHR; + break; case INTSXP: - *out = new IntBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_INT; + break; case REALSXP: - *out = new DblBuilder(); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_DBL; + break; case STRSXP: - *out = new ChrBuilder(); - return NANOARROW_OK; - default: - *out = new ExtensionBuilder(ptype_sexp); - return NANOARROW_OK; + vector_type = VECTOR_TYPE_CHR; + break; } - *out = nullptr; - return ENOTSUP; + return InstantiateBuilderBase(schema, vector_type, ptype_sexp, options, out, error); } diff --git a/r/src/vctr_builder.h b/r/src/vctr_builder.h new file mode 100644 index 000000000..fe0b963cf --- /dev/null +++ b/r/src/vctr_builder.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// An opaque pointer to a C++-implemented collector that is instantiated +// using an ArrowSchema and (optionally) a SEXP ptype and is supplied +// zero or more ArrowArrays before computing the output. +struct VctrBuilder; + +// Options for when to use ALTREP. Currently ALTREP is only implemented +// for character() with exactly one input chunk. The default may eventually +// use some heuristics to decide if there is a likely performance advantage +// to deferring the conversion. +enum VctrBuilderUseAltrep { + VCTR_BUILDER_USE_ALTREP_DEFAULT = 0, + VCTR_BUILDER_USE_ALTREP_ALWAYS = 1, + VCTR_BUILDER_USE_ALTREP_NEVER = 2 +}; + +// Options controlling the details of how arrays are built. Note that +// this does not control the destination ptype: customizing ptype resolution +// is currently possible by passing a function to the `to` argument at the +// top level. Future additions could control the error/warning strategy +// for (potentially) lossy conversions. 
+struct VctrBuilderOptions { + enum VctrBuilderUseAltrep use_altrep; +}; + +#ifdef __cplusplus +} +#endif + +#endif From e1b6e42b883bdf729785882022f861e537c378f6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 Feb 2024 17:53:41 -0400 Subject: [PATCH 05/36] a little bit more dispatch --- r/src/materialize_common.h | 1 + r/src/vctr_builder.cc | 49 +++++++++++++++++++++++--------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/r/src/materialize_common.h b/r/src/materialize_common.h index 480b4393f..5c9b62d33 100644 --- a/r/src/materialize_common.h +++ b/r/src/materialize_common.h @@ -31,6 +31,7 @@ enum VectorType { VECTOR_TYPE_UNINITIALIZED, VECTOR_TYPE_NULL, + VECTOR_TYPE_RAW, VECTOR_TYPE_UNSPECIFIED, VECTOR_TYPE_LGL, VECTOR_TYPE_INT, diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 999aa6381..845c3123d 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -64,6 +64,11 @@ struct VctrBuilder { virtual SEXP GetValue() { return R_NilValue; } }; +// Resolve a builder class from a schema and (optional) ptype and instantiate it +ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, + VctrBuilderOptions options, VctrBuilder** out, + ArrowError* error); + class IntBuilder : public VctrBuilder {}; class DblBuilder : public VctrBuilder {}; @@ -111,13 +116,13 @@ class ExtensionBuilder : public VctrBuilder { extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); extern "C" SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); +// A base method for when we already have the VectorType and have already +// resolved the ptype_sexp (if needed). 
static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, VectorType vector_type, SEXP ptype_sexp, VctrBuilderOptions options, VctrBuilder** out, ArrowError* error) { switch (vector_type) { - case VECTOR_TYPE_NULL: - case VECTOR_TYPE_LGL: *out = new LglBuilder(); return NANOARROW_OK; @@ -165,7 +170,8 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, } } -// Resolve a builder class +// A version of the above but for when we don't know the VectorType yet and +// for when we're not sure if we need to pop into R to infer a ptype. ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, VctrBuilderOptions options, VctrBuilder** out, ArrowError* error) { @@ -206,6 +212,8 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, vector_type = VECTOR_TYPE_DATA_FRAME; } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified")) { vector_type = VECTOR_TYPE_UNSPECIFIED; + } else if (Rf_inherits(ptype_sexp, "vctrs_list_of")) { + vector_type = VECTOR_TYPE_LIST_OF; } else if (Rf_inherits(ptype_sexp, "blob")) { vector_type = VECTOR_TYPE_BLOB; } else if (Rf_inherits(ptype_sexp, "Date")) { @@ -219,22 +227,25 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, } else if (Rf_inherits(ptype_sexp, "integer64")) { vector_type = VECTOR_TYPE_INTEGER64; } - } - - // If we're here, these are non-S3 objects - switch (TYPEOF(ptype_sexp)) { - case LGLSXP: - vector_type = VECTOR_TYPE_CHR; - break; - case INTSXP: - vector_type = VECTOR_TYPE_INT; - break; - case REALSXP: - vector_type = VECTOR_TYPE_DBL; - break; - case STRSXP: - vector_type = VECTOR_TYPE_CHR; - break; + } else { + // If we're here, these are non-S3 objects + switch (TYPEOF(ptype_sexp)) { + case RAWSXP: + vector_type = VECTOR_TYPE_RAW; + break; + case LGLSXP: + vector_type = VECTOR_TYPE_CHR; + break; + case INTSXP: + vector_type = VECTOR_TYPE_INT; + break; + case REALSXP: + vector_type = VECTOR_TYPE_DBL; + break; + case STRSXP: 
+ vector_type = VECTOR_TYPE_CHR; + break; + } } return InstantiateBuilderBase(schema, vector_type, ptype_sexp, options, out, error); From b1e5d9046c1891a1f84806de07fe4578f349ab31 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 Feb 2024 20:49:12 -0400 Subject: [PATCH 06/36] maybe fix compile --- r/src/preserve.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/r/src/preserve.h b/r/src/preserve.h index 7c049d968..aeaed11d7 100644 --- a/r/src/preserve.h +++ b/r/src/preserve.h @@ -22,6 +22,8 @@ #include #include +#include + #ifdef __cplusplus extern "C" { #endif From 2bec5d3b42815174a7ccc689e620777aeede7f34 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 27 Feb 2024 21:38:28 -0400 Subject: [PATCH 07/36] some dispatch --- r/src/init.c | 3 + r/src/vctr_builder.cc | 167 ++++++++++++++++++++++++++++++------- r/src/vctr_builder.h | 4 + r/tools/make-callentries.R | 2 +- 4 files changed, 146 insertions(+), 30 deletions(-) diff --git a/r/src/init.c b/r/src/init.c index 417da0f8e..6544c7b9a 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -95,6 +95,7 @@ extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj); extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list); extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp); extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); +extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); @@ -172,6 +173,8 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_vctr_chunk_offsets", (DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1}, {"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2}, {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, + {"nanoarrow_c_infer_ptype_using_builder", + (DL_FUNC)&nanoarrow_c_infer_ptype_using_builder, 2}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, 
{"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}}; diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 845c3123d..fabf01b7f 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -22,17 +22,28 @@ #include "materialize.h" #include "nanoarrow.h" #include "nanoarrow/r.h" +#include "preserve.h" #include "vctr_builder.h" struct VctrBuilder { public: - // If a ptype is supplied to a VctrBuilder, it must be supplied at construction - // and preserved until the value is no longer needed. This is not an appropriate - // time to error. + // VctrBuilder instances are always created from a vector_type or a ptype. + // InstantiateBuilder() takes care of picking which subclass. The base class + // constructor takes these two arguments to provide consumer implementations + // for inspecting their value. This does not validate any ptypes (that would + // happen in Init() if needed). + VctrBuilder(VectorType vector_type, SEXP ptype_sexp) + : vector_type_(vector_type), ptype_sexp_(R_NilValue), value_(R_NilValue) { + nanoarrow_preserve_sexp(ptype_sexp); + ptype_sexp_ = ptype_sexp; + } - // Enable generic containers like std::vector> - virtual ~VctrBuilder() {} + // Enable generic containers like std::unique_ptr + virtual ~VctrBuilder() { + nanoarrow_release_sexp(ptype_sexp_); + nanoarrow_release_sexp(value_); + } // Initialize this instance with the information available to the resolver, or the // information that was inferred. If using the default `to`, ptype may be R_NilValue @@ -61,7 +72,25 @@ struct VctrBuilder { virtual ArrowErrorCode Finish(ArrowError* error) { return NANOARROW_OK; } // Release the final value of the builder. Calling this method may longjmp. 
- virtual SEXP GetValue() { return R_NilValue; } + virtual SEXP GetValue() { + nanoarrow_release_sexp(value_); + value_ = R_NilValue; + return value_; + } + + // Get (or allocate if required) the SEXP ptype for this output + virtual SEXP GetPtype() { + if (ptype_sexp_ == R_NilValue) { + return nanoarrow_alloc_type(vector_type_, 0); + } else { + return ptype_sexp_; + } + } + + protected: + VectorType vector_type_; + SEXP ptype_sexp_; + SEXP value_; }; // Resolve a builder class from a schema and (optional) ptype and instantiate it @@ -69,47 +98,82 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, VctrBuilderOptions options, VctrBuilder** out, ArrowError* error); -class IntBuilder : public VctrBuilder {}; +class UnspecifiedBuilder : public VctrBuilder { + public: + explicit UnspecifiedBuilder() : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, R_NilValue) {} +}; -class DblBuilder : public VctrBuilder {}; +class IntBuilder : public VctrBuilder { + public: + explicit IntBuilder() : VctrBuilder(VECTOR_TYPE_INT, R_NilValue) {} +}; -class ChrBuilder : public VctrBuilder {}; +class DblBuilder : public VctrBuilder { + public: + explicit DblBuilder() : VctrBuilder(VECTOR_TYPE_DBL, R_NilValue) {} +}; -class LglBuilder : public VctrBuilder {}; +class LglBuilder : public VctrBuilder { + public: + explicit LglBuilder() : VctrBuilder(VECTOR_TYPE_LGL, R_NilValue) {} +}; -class RcrdBuilder : public VctrBuilder { +class Integer64Builder : public VctrBuilder { public: - explicit RcrdBuilder(SEXP ptype_sexp) {} + explicit Integer64Builder() : VctrBuilder(VECTOR_TYPE_INTEGER64, R_NilValue) {} }; -class UnspecifiedBuilder : public VctrBuilder {}; +class ChrBuilder : public VctrBuilder { + public: + explicit ChrBuilder() + : VctrBuilder(VECTOR_TYPE_CHR, R_NilValue), + use_altrep_(VCTR_BUILDER_USE_ALTREP_DEFAULT) {} -class BlobBuilder : public VctrBuilder {}; + VctrBuilderUseAltrep use_altrep_; +}; -class ListOfBuilder : public VctrBuilder { +class BlobBuilder : public 
VctrBuilder { public: - explicit ListOfBuilder(SEXP ptype_sexp) {} + explicit BlobBuilder() : VctrBuilder(VECTOR_TYPE_BLOB, R_NilValue) {} }; -class DateBuilder : public VctrBuilder {}; +class DateBuilder : public VctrBuilder { + public: + explicit DateBuilder() : VctrBuilder(VECTOR_TYPE_DATE, R_NilValue) {} +}; -class HmsBuilder : public VctrBuilder {}; +class HmsBuilder : public VctrBuilder { + public: + explicit HmsBuilder() : VctrBuilder(VECTOR_TYPE_HMS, R_NilValue) {} +}; class PosixctBuilder : public VctrBuilder { public: - explicit PosixctBuilder(SEXP ptype_sexp) {} + explicit PosixctBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_POSIXCT, ptype_sexp) {} }; class DifftimeBuilder : public VctrBuilder { public: - explicit DifftimeBuilder(SEXP ptype_sexp) {} + explicit DifftimeBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_DIFFTIME, ptype_sexp) {} }; -class Integer64Builder : public VctrBuilder {}; +class OtherBuilder : public VctrBuilder { + public: + explicit OtherBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp) {} +}; + +class ListOfBuilder : public VctrBuilder { + public: + explicit ListOfBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_LIST_OF, ptype_sexp) {} +}; -class ExtensionBuilder : public VctrBuilder { +class RcrdBuilder : public VctrBuilder { public: - explicit ExtensionBuilder(SEXP ptype_sexp) {} + explicit RcrdBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_DATA_FRAME, ptype_sexp) {} }; // Currently in infer_ptype.c @@ -120,7 +184,6 @@ extern "C" SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); // resolved the ptype_sexp (if needed). 
static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, VectorType vector_type, SEXP ptype_sexp, - VctrBuilderOptions options, VctrBuilder** out, ArrowError* error) { switch (vector_type) { case VECTOR_TYPE_LGL: @@ -133,7 +196,7 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, *out = new DblBuilder(); return NANOARROW_OK; case VECTOR_TYPE_CHR: - *out = new LglBuilder(); + *out = new ChrBuilder(); return NANOARROW_OK; case VECTOR_TYPE_DATA_FRAME: *out = new RcrdBuilder(ptype_sexp); @@ -163,7 +226,7 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, *out = new Integer64Builder(); return NANOARROW_OK; case VECTOR_TYPE_OTHER: - *out = new ExtensionBuilder(ptype_sexp); + *out = new OtherBuilder(ptype_sexp); return NANOARROW_OK; default: Rf_error("Unknown vector type id: %d", (int)vector_type); @@ -187,8 +250,7 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, case VECTOR_TYPE_DBL: case VECTOR_TYPE_CHR: case VECTOR_TYPE_DATA_FRAME: - return InstantiateBuilderBase(schema, vector_type, R_NilValue, options, out, - error); + return InstantiateBuilderBase(schema, vector_type, R_NilValue, out, error); default: break; } @@ -248,5 +310,52 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, } } - return InstantiateBuilderBase(schema, vector_type, ptype_sexp, options, out, error); + return InstantiateBuilderBase(schema, vector_type, ptype_sexp, out, error); +} + +// C API so that we can reuse these implementations elsewhere + +static void finalize_vctr_builder_xptr(SEXP vctr_builder_xptr) { + auto ptr = reinterpret_cast<VctrBuilder*>(R_ExternalPtrAddr(vctr_builder_xptr)); + if (ptr != nullptr) { + delete ptr; + } +} + +SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp) { + struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); + ArrowError error; + ArrowErrorInit(&error); + + // For now, no configurable options + VctrBuilderOptions 
options; + options.use_altrep = VCTR_BUILDER_USE_ALTREP_DEFAULT; + + // Wrap in an external pointer + SEXP vctr_builder_xptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + R_RegisterCFinalizer(vctr_builder_xptr, &finalize_vctr_builder_xptr); + + // Instantiate the builder + VctrBuilder* vctr_builder = nullptr; + int code = InstantiateBuilder(schema, ptype_sexp, options, &vctr_builder, &error); + if (code != NANOARROW_OK) { + Rf_error("Failed to instantiate VctrBuilder: %s", error.message); + } + + R_SetExternalPtrAddr(vctr_builder_xptr, vctr_builder); + + // Initialize + code = vctr_builder->Init(schema, options, &error); + if (code != NANOARROW_OK) { + Rf_error("Failed to initialize VctrBuilder: %s", error.message); + } + + UNPROTECT(1); + return vctr_builder_xptr; +} + +SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp) { + SEXP vctr_bulider_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, ptype_sexp)); + auto vctr_builder = reinterpret_cast(vctr_bulider_xptr); + return vctr_builder->GetPtype(); } diff --git a/r/src/vctr_builder.h b/r/src/vctr_builder.h index fe0b963cf..58bb56ace 100644 --- a/r/src/vctr_builder.h +++ b/r/src/vctr_builder.h @@ -50,6 +50,10 @@ struct VctrBuilderOptions { enum VctrBuilderUseAltrep use_altrep; }; +SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp); + +SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp); + #ifdef __cplusplus } #endif diff --git a/r/tools/make-callentries.R b/r/tools/make-callentries.R index 403169c3c..552ff05fb 100644 --- a/r/tools/make-callentries.R +++ b/r/tools/make-callentries.R @@ -21,7 +21,7 @@ library(tidyverse) -src_files <- list.files("src", "\\.(c|cpp)$", full.names = TRUE) %>% +src_files <- list.files("src", "\\.(c|cc)$", full.names = TRUE) %>% setdiff("src/init.c") src_sources <- src_files %>% set_names() %>% map_chr(readr::read_file) From bbf46d7bd34366bb01c66e1cafa8311cd1acc5c7 Mon Sep 17 00:00:00 2001 From: 
Dewey Dunnington Date: Tue, 27 Feb 2024 21:56:18 -0400 Subject: [PATCH 08/36] start migrating infer_ptype --- r/R/infer-ptype.R | 2 +- r/src/init.c | 3 ++- r/src/vctr_builder.cc | 38 +++++++++++++++++++++++++++++++------- r/src/vctr_builder.h | 2 +- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/r/R/infer-ptype.R b/r/R/infer-ptype.R index ce3c7165f..e52a03fbe 100644 --- a/r/R/infer-ptype.R +++ b/r/R/infer-ptype.R @@ -56,7 +56,7 @@ infer_nanoarrow_ptype <- function(x) { stop("`x` must be a nanoarrow_schema(), nanoarrow_array(), or nanoarrow_array_stream()") } - .Call(nanoarrow_c_infer_ptype, x) + .Call(nanoarrow_c_infer_ptype_using_builder, x) } # This is called from C from nanoarrow_c_infer_ptype when all the C conversions diff --git a/r/src/init.c b/r/src/init.c index 6544c7b9a..2e5c07967 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -96,6 +96,7 @@ extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list); extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp); extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp); +extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); @@ -174,7 +175,7 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2}, {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, {"nanoarrow_c_infer_ptype_using_builder", - (DL_FUNC)&nanoarrow_c_infer_ptype_using_builder, 2}, + (DL_FUNC)&nanoarrow_c_infer_ptype_using_builder, 1}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, {"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}}; diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index fabf01b7f..6742895a0 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ 
-50,7 +50,7 @@ struct VctrBuilder { // with Options containing the inferred information. Calling this method may longjmp. virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) { - return ENOTSUP; + return NANOARROW_OK; } // Push an array into this builder and do not take ownership of array. This is @@ -178,7 +178,21 @@ class RcrdBuilder : public VctrBuilder { // Currently in infer_ptype.c extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); -extern "C" SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); + +// Call nanoarrow::infer_ptype_other(), which handles less common types that +// are easier to compute in R or gives an informative error if this is +// not possible. +static SEXP call_infer_ptype_other(const ArrowSchema* schema) { + SEXP schema_xptr = PROTECT( + R_MakeExternalPtr(const_cast(schema), R_NilValue, R_NilValue)); + Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); + + SEXP fun = PROTECT(Rf_install("infer_ptype_other")); + SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); + SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); + UNPROTECT(4); + return result; +} // A base method for when we already have the VectorType and have already // resolved the ptype_sexp (if needed). 
@@ -260,7 +274,14 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, SEXP schema_xptr = PROTECT( R_MakeExternalPtr(const_cast(schema), R_NilValue, R_NilValue)); Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); - SEXP inferred_ptype_sexp = PROTECT(nanoarrow_c_infer_ptype(schema_xptr)); + SEXP inferred_ptype_sexp = PROTECT(call_infer_ptype_other(schema)); + + // Error if it returns null, since this would put us in an infinite loop + if (inferred_ptype_sexp == R_NilValue) { + ArrowErrorSet(error, "infer_nanoarrow_ptype() returned NULL"); + return EINVAL; + } + int code = InstantiateBuilder(schema, inferred_ptype_sexp, options, out, error); UNPROTECT(1); return code; @@ -354,8 +375,11 @@ SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp) { return vctr_builder_xptr; } -SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp) { - SEXP vctr_bulider_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, ptype_sexp)); - auto vctr_builder = reinterpret_cast(vctr_bulider_xptr); - return vctr_builder->GetPtype(); +SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr) { + SEXP vctr_bulider_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, R_NilValue)); + auto vctr_builder = + reinterpret_cast(R_ExternalPtrAddr(vctr_bulider_xptr)); + SEXP ptype_sexp = PROTECT(vctr_builder->GetPtype()); + UNPROTECT(2); + return ptype_sexp; } diff --git a/r/src/vctr_builder.h b/r/src/vctr_builder.h index 58bb56ace..823dd4d0b 100644 --- a/r/src/vctr_builder.h +++ b/r/src/vctr_builder.h @@ -52,7 +52,7 @@ struct VctrBuilderOptions { SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp); -SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp); +SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr); #ifdef __cplusplus } From 3de01c7749468d6b433615956e715de74e9fd7c0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 27 Feb 2024 22:16:15 -0400 Subject: 
[PATCH 09/36] with data.frame --- r/src/vctr_builder.cc | 58 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 6742895a0..f7b4dd765 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -19,6 +19,9 @@ #include #include +#include +#include + #include "materialize.h" #include "nanoarrow.h" #include "nanoarrow/r.h" @@ -174,6 +177,61 @@ class RcrdBuilder : public VctrBuilder { public: explicit RcrdBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DATA_FRAME, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + // Check can convert here + + // Instantiate and initialize children + children_.resize(schema->n_children); + for (int64_t i = 0; i < schema->n_children; i++) { + SEXP child_ptype_sexp; + if (ptype_sexp_ != R_NilValue) { + child_ptype_sexp = VECTOR_ELT(ptype_sexp_, i); + } else { + child_ptype_sexp = R_NilValue; + } + + VctrBuilder* child = nullptr; + NANOARROW_RETURN_NOT_OK(InstantiateBuilder(schema->children[i], child_ptype_sexp, + options, &child, error)); + children_[i].reset(child); + NANOARROW_RETURN_NOT_OK(child->Init(schema->children[i], options, error)); + } + + schema_ = schema; + return NANOARROW_OK; + } + + SEXP GetPtype() override { + if (ptype_sexp_ != R_NilValue) { + return ptype_sexp_; + } + + SEXP result = PROTECT(Rf_allocVector(VECSXP, schema_->n_children)); + SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema_->n_children)); + for (R_xlen_t i = 0; i < schema_->n_children; i++) { + struct ArrowSchema* child = schema_->children[i]; + if (child->name != NULL) { + SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); + } else { + SET_STRING_ELT(result_names, i, Rf_mkChar("")); + } + } + + Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); + Rf_setAttrib(result, R_NamesSymbol, result_names); + SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 
2)); + INTEGER(rownames)[0] = NA_INTEGER; + INTEGER(rownames)[1] = 0; + Rf_setAttrib(result, R_RowNamesSymbol, rownames); + UNPROTECT(3); + return result; + } + + private: + const ArrowSchema* schema_; + std::vector<std::unique_ptr<VctrBuilder>> children_; }; // Currently in infer_ptype.c From d28efa7cea57db5436ad0cf2f342e1869fde7685 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 27 Feb 2024 22:51:42 -0400 Subject: [PATCH 10/36] with passing tests for infer --- r/src/vctr_builder.cc | 126 +++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 56 deletions(-) diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index f7b4dd765..5b0021811 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -29,6 +29,24 @@ #include "vctr_builder.h" +// Call nanoarrow::infer_ptype_other(), which handles less common types that +// are easier to compute in R or gives an informative error if this is +// not possible. +static SEXP call_infer_ptype_other(const ArrowSchema* schema) { + SEXP schema_xptr = PROTECT( + R_MakeExternalPtr(const_cast<ArrowSchema*>(schema), R_NilValue, R_NilValue)); + Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); + + SEXP fun = PROTECT(Rf_install("infer_ptype_other")); + SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); + SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); + UNPROTECT(4); + return result; +} + +// Currently in infer_ptype.c +extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); + struct VctrBuilder { public: // VctrBuilder instances are always created from a vector_type or a ptype. @@ -53,6 +71,7 @@ struct VctrBuilder { // with Options containing the inferred information. Calling this method may longjmp. 
virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) { + schema_ = schema; return NANOARROW_OK; } @@ -83,17 +102,23 @@ struct VctrBuilder { // Get (or allocate if required) the SEXP ptype for this output virtual SEXP GetPtype() { - if (ptype_sexp_ == R_NilValue) { - return nanoarrow_alloc_type(vector_type_, 0); - } else { + if (ptype_sexp_ != R_NilValue) { return ptype_sexp_; } + + SEXP result = nanoarrow_alloc_type(vector_type_, 0); + if (result != R_NilValue) { + return result; + } + + return call_infer_ptype_other(schema_); } protected: VectorType vector_type_; SEXP ptype_sexp_; SEXP value_; + const ArrowSchema* schema_; }; // Resolve a builder class from a schema and (optional) ptype and instantiate it @@ -103,33 +128,35 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, class UnspecifiedBuilder : public VctrBuilder { public: - explicit UnspecifiedBuilder() : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, R_NilValue) {} + explicit UnspecifiedBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} }; class IntBuilder : public VctrBuilder { public: - explicit IntBuilder() : VctrBuilder(VECTOR_TYPE_INT, R_NilValue) {} + explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} }; class DblBuilder : public VctrBuilder { public: - explicit DblBuilder() : VctrBuilder(VECTOR_TYPE_DBL, R_NilValue) {} + explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} }; class LglBuilder : public VctrBuilder { public: - explicit LglBuilder() : VctrBuilder(VECTOR_TYPE_LGL, R_NilValue) {} + explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} }; class Integer64Builder : public VctrBuilder { public: - explicit Integer64Builder() : VctrBuilder(VECTOR_TYPE_INTEGER64, R_NilValue) {} + explicit Integer64Builder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_INTEGER64, ptype_sexp) {} }; class ChrBuilder : public 
VctrBuilder { public: - explicit ChrBuilder() - : VctrBuilder(VECTOR_TYPE_CHR, R_NilValue), + explicit ChrBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp), use_altrep_(VCTR_BUILDER_USE_ALTREP_DEFAULT) {} VctrBuilderUseAltrep use_altrep_; @@ -137,17 +164,17 @@ class ChrBuilder : public VctrBuilder { class BlobBuilder : public VctrBuilder { public: - explicit BlobBuilder() : VctrBuilder(VECTOR_TYPE_BLOB, R_NilValue) {} + explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} }; class DateBuilder : public VctrBuilder { public: - explicit DateBuilder() : VctrBuilder(VECTOR_TYPE_DATE, R_NilValue) {} + explicit DateBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DATE, ptype_sexp) {} }; class HmsBuilder : public VctrBuilder { public: - explicit HmsBuilder() : VctrBuilder(VECTOR_TYPE_HMS, R_NilValue) {} + explicit HmsBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_HMS, ptype_sexp) {} }; class PosixctBuilder : public VctrBuilder { @@ -180,7 +207,9 @@ class RcrdBuilder : public VctrBuilder { ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) override { - // Check can convert here + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + + // TODO: Check can convert here // Instantiate and initialize children children_.resize(schema->n_children); @@ -217,6 +246,10 @@ class RcrdBuilder : public VctrBuilder { } else { SET_STRING_ELT(result_names, i, Rf_mkChar("")); } + + SEXP child_sexp = PROTECT(children_[i]->GetPtype()); + SET_VECTOR_ELT(result, i, child_sexp); + UNPROTECT(1); } Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); @@ -230,28 +263,9 @@ class RcrdBuilder : public VctrBuilder { } private: - const ArrowSchema* schema_; std::vector> children_; }; -// Currently in infer_ptype.c -extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); - -// Call nanoarrow::infer_ptype_other(), which handles less common types that -// are 
easier to compute in R or gives an informative error if this is -// not possible. -static SEXP call_infer_ptype_other(const ArrowSchema* schema) { - SEXP schema_xptr = PROTECT( - R_MakeExternalPtr(const_cast(schema), R_NilValue, R_NilValue)); - Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); - - SEXP fun = PROTECT(Rf_install("infer_ptype_other")); - SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); - SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); - UNPROTECT(4); - return result; -} - // A base method for when we already have the VectorType and have already // resolved the ptype_sexp (if needed). static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, @@ -259,16 +273,16 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, VctrBuilder** out, ArrowError* error) { switch (vector_type) { case VECTOR_TYPE_LGL: - *out = new LglBuilder(); + *out = new LglBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_INT: - *out = new IntBuilder(); + *out = new IntBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_DBL: - *out = new DblBuilder(); + *out = new DblBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_CHR: - *out = new ChrBuilder(); + *out = new ChrBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_DATA_FRAME: *out = new RcrdBuilder(ptype_sexp); @@ -277,16 +291,16 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, *out = new ListOfBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_UNSPECIFIED: - *out = new UnspecifiedBuilder(); + *out = new UnspecifiedBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_BLOB: - *out = new BlobBuilder(); + *out = new BlobBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_DATE: - *out = new DateBuilder(); + *out = new DateBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_HMS: - *out = new HmsBuilder(); + *out = new HmsBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_POSIXCT: *out = new 
PosixctBuilder(ptype_sexp); @@ -295,7 +309,7 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, *out = new DifftimeBuilder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_INTEGER64: - *out = new Integer64Builder(); + *out = new Integer64Builder(ptype_sexp); return NANOARROW_OK; case VECTOR_TYPE_OTHER: *out = new OtherBuilder(ptype_sexp); @@ -315,23 +329,23 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, ArrowSchemaView view; NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, error)); - enum VectorType vector_type = nanoarrow_infer_vector_type(view.type); - switch (vector_type) { - case VECTOR_TYPE_LGL: - case VECTOR_TYPE_INT: - case VECTOR_TYPE_DBL: - case VECTOR_TYPE_CHR: - case VECTOR_TYPE_DATA_FRAME: - return InstantiateBuilderBase(schema, vector_type, R_NilValue, out, error); - default: - break; + // Ensure extension types always go through infer_ptype_other() + if (view.extension_name.size_bytes == 0) { + enum VectorType vector_type = nanoarrow_infer_vector_type(view.type); + switch (vector_type) { + case VECTOR_TYPE_LGL: + case VECTOR_TYPE_INT: + case VECTOR_TYPE_DBL: + case VECTOR_TYPE_CHR: + case VECTOR_TYPE_DATA_FRAME: + return InstantiateBuilderBase(schema, vector_type, R_NilValue, out, error); + default: + break; + } } // Otherwise, resolve the ptype and use it (this will error for ptypes that can't be // resolved) - SEXP schema_xptr = PROTECT( - R_MakeExternalPtr(const_cast(schema), R_NilValue, R_NilValue)); - Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); SEXP inferred_ptype_sexp = PROTECT(call_infer_ptype_other(schema)); // Error if it returns null, since this would put us in an infinite loop @@ -411,7 +425,7 @@ SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp) { options.use_altrep = VCTR_BUILDER_USE_ALTREP_DEFAULT; // Wrap in an external pointer - SEXP vctr_builder_xptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + SEXP 
vctr_builder_xptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, schema_xptr)); R_RegisterCFinalizer(vctr_builder_xptr, &finalize_vctr_builder_xptr); // Instantiate the builder From 42b1b852665ac1c60ea0e821528006ab833a64b0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 27 Feb 2024 22:57:13 -0400 Subject: [PATCH 11/36] remove ptype bits --- r/R/infer-ptype.R | 2 +- r/src/infer_ptype.c | 68 ++----------------------------------------- r/src/vctr_builder.cc | 45 +++++++++++++++++++++++++--- r/src/vctr_builder.h | 2 +- 4 files changed, 45 insertions(+), 72 deletions(-) diff --git a/r/R/infer-ptype.R b/r/R/infer-ptype.R index e52a03fbe..ce3c7165f 100644 --- a/r/R/infer-ptype.R +++ b/r/R/infer-ptype.R @@ -56,7 +56,7 @@ infer_nanoarrow_ptype <- function(x) { stop("`x` must be a nanoarrow_schema(), nanoarrow_array(), or nanoarrow_array_stream()") } - .Call(nanoarrow_c_infer_ptype_using_builder, x) + .Call(nanoarrow_c_infer_ptype, x) } # This is called from C from nanoarrow_c_infer_ptype when all the C conversions diff --git a/r/src/infer_ptype.c b/r/src/infer_ptype.c index 1f5f8e042..0ffacf0df 100644 --- a/r/src/infer_ptype.c +++ b/r/src/infer_ptype.c @@ -34,7 +34,7 @@ // that later warns for out-of-range values (e.g., int64 to double()); // however, a user can use the convert_array(x, ptype = something_safer()) // when this occurs. 
-enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { +static enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_BOOL: return VECTOR_TYPE_LGL; @@ -69,7 +69,7 @@ enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { } // The same as the above, but from a nanoarrow_schema() -enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { +static enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowSchemaView schema_view; @@ -89,67 +89,3 @@ enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr) { return nanoarrow_infer_vector_type_schema(array_xptr_get_schema(array_xptr)); } - -// Call nanoarrow::infer_ptype_other(), which handles less common types that -// are easier to compute in R or gives an informative error if this is -// not possible. 
-static SEXP call_infer_ptype_other(SEXP schema_xptr) { - SEXP fun = PROTECT(Rf_install("infer_ptype_other")); - SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); - SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); - UNPROTECT(3); - return result; -} - -SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); - -static SEXP infer_ptype_data_frame(SEXP schema_xptr) { - struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); - SEXP result = PROTECT(Rf_allocVector(VECSXP, schema->n_children)); - SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema->n_children)); - - for (R_xlen_t i = 0; i < schema->n_children; i++) { - SEXP child_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i)); - SET_VECTOR_ELT(result, i, nanoarrow_c_infer_ptype(child_xptr)); - UNPROTECT(1); - - struct ArrowSchema* child = schema->children[i]; - if (child->name != NULL) { - SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); - } else { - SET_STRING_ELT(result_names, i, Rf_mkChar("")); - } - } - - Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); - Rf_setAttrib(result, R_NamesSymbol, result_names); - SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); - INTEGER(rownames)[0] = NA_INTEGER; - INTEGER(rownames)[1] = 0; - Rf_setAttrib(result, R_RowNamesSymbol, rownames); - UNPROTECT(3); - return result; -} - -SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) { - enum VectorType vector_type = nanoarrow_infer_vector_type_schema(schema_xptr); - SEXP ptype = R_NilValue; - - switch (vector_type) { - case VECTOR_TYPE_LGL: - case VECTOR_TYPE_INT: - case VECTOR_TYPE_DBL: - case VECTOR_TYPE_CHR: - ptype = PROTECT(nanoarrow_alloc_type(vector_type, 0)); - break; - case VECTOR_TYPE_DATA_FRAME: - ptype = PROTECT(infer_ptype_data_frame(schema_xptr)); - break; - default: - ptype = PROTECT(call_infer_ptype_other(schema_xptr)); - break; - } - - UNPROTECT(1); - return ptype; -} diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 5b0021811..97793d018 100644 --- 
a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -29,6 +29,46 @@ #include "vctr_builder.h" +// These conversions are the default R-native type guesses for +// an array that don't require extra information from the ptype (e.g., +// factor with levels). Some of these guesses may result in a conversion +// that later warns for out-of-range values (e.g., int64 to double()); +// however, a user can use the convert_array(x, ptype = something_safer()) +// when this occurs. +enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { + switch (type) { + case NANOARROW_TYPE_BOOL: + return VECTOR_TYPE_LGL; + + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + return VECTOR_TYPE_INT; + + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + case NANOARROW_TYPE_DECIMAL128: + return VECTOR_TYPE_DBL; + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + return VECTOR_TYPE_CHR; + + case NANOARROW_TYPE_DENSE_UNION: + case NANOARROW_TYPE_SPARSE_UNION: + case NANOARROW_TYPE_STRUCT: + return VECTOR_TYPE_DATA_FRAME; + + default: + return VECTOR_TYPE_OTHER; + } +} + // Call nanoarrow::infer_ptype_other(), which handles less common types that // are easier to compute in R or gives an informative error if this is // not possible. @@ -44,9 +84,6 @@ static SEXP call_infer_ptype_other(const ArrowSchema* schema) { return result; } -// Currently in infer_ptype.c -extern "C" enum VectorType nanoarrow_infer_vector_type(enum ArrowType type); - struct VctrBuilder { public: // VctrBuilder instances are always created from a vector_type or a ptype. 
@@ -447,7 +484,7 @@ SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp) { return vctr_builder_xptr; } -SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr) { +SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) { SEXP vctr_bulider_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, R_NilValue)); auto vctr_builder = reinterpret_cast(R_ExternalPtrAddr(vctr_bulider_xptr)); diff --git a/r/src/vctr_builder.h b/r/src/vctr_builder.h index 823dd4d0b..a037c9513 100644 --- a/r/src/vctr_builder.h +++ b/r/src/vctr_builder.h @@ -52,7 +52,7 @@ struct VctrBuilderOptions { SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp); -SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr); +SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); #ifdef __cplusplus } From 15f2e177be8d9f67008ea2513082f37426858a18 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 27 Feb 2024 23:13:28 -0400 Subject: [PATCH 12/36] start to split out builder classes --- r/src/vctr_builder.cc | 85 +++++------------------------------ r/src/vctr_builder_base.h | 93 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 75 deletions(-) create mode 100644 r/src/vctr_builder_base.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 97793d018..ca95b9177 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -28,6 +28,7 @@ #include "preserve.h" #include "vctr_builder.h" +#include "vctr_builder_base.h" // These conversions are the default R-native type guesses for // an array that don't require extra information from the ptype (e.g., @@ -84,80 +85,6 @@ static SEXP call_infer_ptype_other(const ArrowSchema* schema) { return result; } -struct VctrBuilder { - public: - // VctrBuilder instances are always created from a vector_type or a ptype. - // InstantiateBuilder() takes care of picking which subclass. The base class - // constructor takes these two arguments to provide consumer implementations - // for inspecting their value. 
This does not validate any ptypes (that would - // happen in Init() if needed). - VctrBuilder(VectorType vector_type, SEXP ptype_sexp) - : vector_type_(vector_type), ptype_sexp_(R_NilValue), value_(R_NilValue) { - nanoarrow_preserve_sexp(ptype_sexp); - ptype_sexp_ = ptype_sexp; - } - - // Enable generic containers like std::unique_ptr - virtual ~VctrBuilder() { - nanoarrow_release_sexp(ptype_sexp_); - nanoarrow_release_sexp(value_); - } - - // Initialize this instance with the information available to the resolver, or the - // information that was inferred. If using the default `to`, ptype may be R_NilValue - // with Options containing the inferred information. Calling this method may longjmp. - virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, - ArrowError* error) { - schema_ = schema; - return NANOARROW_OK; - } - - // Push an array into this builder and do not take ownership of array. This is - // called when the caller cannot safely relinquish ownership of an array (e.g., - // convert_array()). Calling this method may longjmp. - virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) { - return ENOTSUP; - } - - // Push an array into this builder. The implementation may (but is not required) to take - // ownership. This is called when the caller can relinquish ownership (e.g., - // convert_array_stream()). Calling this method may longjmp. - virtual ArrowErrorCode PushNextOwning(ArrowArray* array, ArrowError* error) { - return PushNext(array, error); - } - - // Perform any final calculations required to calculate the return value. - // Calling this method may longjmp. - virtual ArrowErrorCode Finish(ArrowError* error) { return NANOARROW_OK; } - - // Release the final value of the builder. Calling this method may longjmp. 
- virtual SEXP GetValue() { - nanoarrow_release_sexp(value_); - value_ = R_NilValue; - return value_; - } - - // Get (or allocate if required) the SEXP ptype for this output - virtual SEXP GetPtype() { - if (ptype_sexp_ != R_NilValue) { - return ptype_sexp_; - } - - SEXP result = nanoarrow_alloc_type(vector_type_, 0); - if (result != R_NilValue) { - return result; - } - - return call_infer_ptype_other(schema_); - } - - protected: - VectorType vector_type_; - SEXP ptype_sexp_; - SEXP value_; - const ArrowSchema* schema_; -}; - // Resolve a builder class from a schema and (optional) ptype and instantiate it ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, VctrBuilderOptions options, VctrBuilder** out, @@ -172,16 +99,22 @@ class UnspecifiedBuilder : public VctrBuilder { class IntBuilder : public VctrBuilder { public: explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } }; class DblBuilder : public VctrBuilder { public: explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } }; class LglBuilder : public VctrBuilder { public: explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(LGLSXP, 0); } }; class Integer64Builder : public VctrBuilder { @@ -196,6 +129,8 @@ class ChrBuilder : public VctrBuilder { : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp), use_altrep_(VCTR_BUILDER_USE_ALTREP_DEFAULT) {} + SEXP GetPtype() override { return Rf_allocVector(STRSXP, 0); } + VctrBuilderUseAltrep use_altrep_; }; @@ -426,7 +361,7 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, vector_type = VECTOR_TYPE_RAW; break; case LGLSXP: - vector_type = VECTOR_TYPE_CHR; + vector_type = VECTOR_TYPE_LGL; break; case INTSXP: vector_type = VECTOR_TYPE_INT; diff --git 
a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h new file mode 100644 index 000000000..0b4f05ddc --- /dev/null +++ b/r/src/vctr_builder_base.h @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_BASE_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_BASE_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "materialize.h" +#include "nanoarrow.h" +#include "preserve.h" +#include "vctr_builder.h" + +struct VctrBuilder { + public: + // VctrBuilder instances are always created from a vector_type or a ptype. + // InstantiateBuilder() takes care of picking which subclass. The base class + // constructor takes these two arguments to provide consumer implementations + // for inspecting their value. This does not validate any ptypes (that would + // happen in Init() if needed). 
+ VctrBuilder(VectorType vector_type, SEXP ptype_sexp) + : vector_type_(vector_type), ptype_sexp_(R_NilValue), value_(R_NilValue) { + nanoarrow_preserve_sexp(ptype_sexp); + ptype_sexp_ = ptype_sexp; + } + + // Enable generic containers like std::unique_ptr + virtual ~VctrBuilder() { + nanoarrow_release_sexp(ptype_sexp_); + nanoarrow_release_sexp(value_); + } + + // Initialize this instance with the information available to the resolver, or the + // information that was inferred. If using the default `to`, ptype may be R_NilValue + // with Options containing the inferred information. Calling this method may longjmp. + virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) { + schema_ = schema; + return NANOARROW_OK; + } + + // Push an array into this builder and do not take ownership of array. This is + // called when the caller cannot safely relinquish ownership of an array (e.g., + // convert_array()). Calling this method may longjmp. + virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) { + return ENOTSUP; + } + + // Push an array into this builder. The implementation may (but is not required) to take + // ownership. This is called when the caller can relinquish ownership (e.g., + // convert_array_stream()). Calling this method may longjmp. + virtual ArrowErrorCode PushNextOwning(ArrowArray* array, ArrowError* error) { + return PushNext(array, error); + } + + // Perform any final calculations required to calculate the return value. + // Calling this method may longjmp. + virtual ArrowErrorCode Finish(ArrowError* error) { return NANOARROW_OK; } + + // Release the final value of the builder. Calling this method may longjmp. 
+ virtual SEXP GetValue() { + nanoarrow_release_sexp(value_); + value_ = R_NilValue; + return value_; + } + + // Get (or allocate if required) the SEXP ptype for this output + virtual SEXP GetPtype() { return ptype_sexp_; } + + protected: + VectorType vector_type_; + SEXP ptype_sexp_; + SEXP value_; + const ArrowSchema* schema_; +}; + +#endif From 04f56cd8edc77c76bd35809f904255512853ae2a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 27 Feb 2024 23:23:20 -0400 Subject: [PATCH 13/36] split out classes into smaller files --- r/src/vctr_builder.cc | 157 +-------------------------------- r/src/vctr_builder_base.h | 5 ++ r/src/vctr_builder_list_of.h | 33 +++++++ r/src/vctr_builder_primitive.h | 103 +++++++++++++++++++++ r/src/vctr_builder_rcrd.h | 96 ++++++++++++++++++++ 5 files changed, 240 insertions(+), 154 deletions(-) create mode 100644 r/src/vctr_builder_list_of.h create mode 100644 r/src/vctr_builder_primitive.h create mode 100644 r/src/vctr_builder_rcrd.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index ca95b9177..45c4a2111 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -25,10 +25,12 @@ #include "materialize.h" #include "nanoarrow.h" #include "nanoarrow/r.h" -#include "preserve.h" #include "vctr_builder.h" #include "vctr_builder_base.h" +#include "vctr_builder_primitive.h" +#include "vctr_builder_rcrd.h" +#include "vctr_builder_list_of.h" // These conversions are the default R-native type guesses for // an array that don't require extra information from the ptype (e.g., @@ -85,159 +87,6 @@ static SEXP call_infer_ptype_other(const ArrowSchema* schema) { return result; } -// Resolve a builder class from a schema and (optional) ptype and instantiate it -ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, - VctrBuilderOptions options, VctrBuilder** out, - ArrowError* error); - -class UnspecifiedBuilder : public VctrBuilder { - public: - explicit UnspecifiedBuilder(SEXP ptype_sexp) - : 
VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} -}; - -class IntBuilder : public VctrBuilder { - public: - explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} - - SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } -}; - -class DblBuilder : public VctrBuilder { - public: - explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} - - SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } -}; - -class LglBuilder : public VctrBuilder { - public: - explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} - - SEXP GetPtype() override { return Rf_allocVector(LGLSXP, 0); } -}; - -class Integer64Builder : public VctrBuilder { - public: - explicit Integer64Builder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_INTEGER64, ptype_sexp) {} -}; - -class ChrBuilder : public VctrBuilder { - public: - explicit ChrBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp), - use_altrep_(VCTR_BUILDER_USE_ALTREP_DEFAULT) {} - - SEXP GetPtype() override { return Rf_allocVector(STRSXP, 0); } - - VctrBuilderUseAltrep use_altrep_; -}; - -class BlobBuilder : public VctrBuilder { - public: - explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} -}; - -class DateBuilder : public VctrBuilder { - public: - explicit DateBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DATE, ptype_sexp) {} -}; - -class HmsBuilder : public VctrBuilder { - public: - explicit HmsBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_HMS, ptype_sexp) {} -}; - -class PosixctBuilder : public VctrBuilder { - public: - explicit PosixctBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_POSIXCT, ptype_sexp) {} -}; - -class DifftimeBuilder : public VctrBuilder { - public: - explicit DifftimeBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_DIFFTIME, ptype_sexp) {} -}; - -class OtherBuilder : public VctrBuilder { - public: - explicit OtherBuilder(SEXP ptype_sexp) : 
VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp) {} -}; - -class ListOfBuilder : public VctrBuilder { - public: - explicit ListOfBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_LIST_OF, ptype_sexp) {} -}; - -class RcrdBuilder : public VctrBuilder { - public: - explicit RcrdBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_DATA_FRAME, ptype_sexp) {} - - ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, - ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); - - // TODO: Check can convert here - - // Instantiate and initialize children - children_.resize(schema->n_children); - for (int64_t i = 0; i < schema->n_children; i++) { - SEXP child_ptype_sexp; - if (ptype_sexp_ != R_NilValue) { - child_ptype_sexp = VECTOR_ELT(ptype_sexp_, i); - } else { - child_ptype_sexp = R_NilValue; - } - - VctrBuilder* child = nullptr; - NANOARROW_RETURN_NOT_OK(InstantiateBuilder(schema->children[i], child_ptype_sexp, - options, &child, error)); - children_[i].reset(child); - NANOARROW_RETURN_NOT_OK(child->Init(schema->children[i], options, error)); - } - - schema_ = schema; - return NANOARROW_OK; - } - - SEXP GetPtype() override { - if (ptype_sexp_ != R_NilValue) { - return ptype_sexp_; - } - - SEXP result = PROTECT(Rf_allocVector(VECSXP, schema_->n_children)); - SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema_->n_children)); - for (R_xlen_t i = 0; i < schema_->n_children; i++) { - struct ArrowSchema* child = schema_->children[i]; - if (child->name != NULL) { - SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); - } else { - SET_STRING_ELT(result_names, i, Rf_mkChar("")); - } - - SEXP child_sexp = PROTECT(children_[i]->GetPtype()); - SET_VECTOR_ELT(result, i, child_sexp); - UNPROTECT(1); - } - - Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); - Rf_setAttrib(result, R_NamesSymbol, result_names); - SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); - INTEGER(rownames)[0] = 
NA_INTEGER; - INTEGER(rownames)[1] = 0; - Rf_setAttrib(result, R_RowNamesSymbol, rownames); - UNPROTECT(3); - return result; - } - - private: - std::vector> children_; -}; - // A base method for when we already have the VectorType and have already // resolved the ptype_sexp (if needed). static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index 0b4f05ddc..3db311084 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -90,4 +90,9 @@ struct VctrBuilder { const ArrowSchema* schema_; }; +// Resolve a builder class from a schema and (optional) ptype and instantiate it +ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, + VctrBuilderOptions options, VctrBuilder** out, + ArrowError* error); + #endif diff --git a/r/src/vctr_builder_list_of.h b/r/src/vctr_builder_list_of.h new file mode 100644 index 000000000..e100fa469 --- /dev/null +++ b/r/src/vctr_builder_list_of.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_LIST_OF_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_LIST_OF_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class ListOfBuilder : public VctrBuilder { + public: + explicit ListOfBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_LIST_OF, ptype_sexp) {} +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h new file mode 100644 index 000000000..562c4381d --- /dev/null +++ b/r/src/vctr_builder_primitive.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_PRIMITIVE_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_PRIMITIVE_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class UnspecifiedBuilder : public VctrBuilder { + public: + explicit UnspecifiedBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} +}; + +class IntBuilder : public VctrBuilder { + public: + explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } +}; + +class DblBuilder : public VctrBuilder { + public: + explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } +}; + +class LglBuilder : public VctrBuilder { + public: + explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(LGLSXP, 0); } +}; + +class Integer64Builder : public VctrBuilder { + public: + explicit Integer64Builder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_INTEGER64, ptype_sexp) {} +}; + +class ChrBuilder : public VctrBuilder { + public: + explicit ChrBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp), + use_altrep_(VCTR_BUILDER_USE_ALTREP_DEFAULT) {} + + SEXP GetPtype() override { return Rf_allocVector(STRSXP, 0); } + + VctrBuilderUseAltrep use_altrep_; +}; + +class BlobBuilder : public VctrBuilder { + public: + explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} +}; + +class DateBuilder : public VctrBuilder { + public: + explicit DateBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DATE, ptype_sexp) {} +}; + +class HmsBuilder : public VctrBuilder { + public: + explicit HmsBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_HMS, ptype_sexp) {} +}; + +class PosixctBuilder : public VctrBuilder { + public: + explicit PosixctBuilder(SEXP ptype_sexp) + : 
VctrBuilder(VECTOR_TYPE_POSIXCT, ptype_sexp) {} +}; + +class DifftimeBuilder : public VctrBuilder { + public: + explicit DifftimeBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_DIFFTIME, ptype_sexp) {} +}; + +class OtherBuilder : public VctrBuilder { + public: + explicit OtherBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp) {} +}; + +#endif diff --git a/r/src/vctr_builder_rcrd.h b/r/src/vctr_builder_rcrd.h new file mode 100644 index 000000000..99856655a --- /dev/null +++ b/r/src/vctr_builder_rcrd.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_RCRD_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_RCRD_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include +#include + +#include "vctr_builder_base.h" + +class RcrdBuilder : public VctrBuilder { + public: + explicit RcrdBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_DATA_FRAME, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + + // TODO: Check can convert here + + // Instantiate and initialize children + children_.resize(schema->n_children); + for (int64_t i = 0; i < schema->n_children; i++) { + SEXP child_ptype_sexp; + if (ptype_sexp_ != R_NilValue) { + child_ptype_sexp = VECTOR_ELT(ptype_sexp_, i); + } else { + child_ptype_sexp = R_NilValue; + } + + VctrBuilder* child = nullptr; + NANOARROW_RETURN_NOT_OK(InstantiateBuilder(schema->children[i], child_ptype_sexp, + options, &child, error)); + children_[i].reset(child); + NANOARROW_RETURN_NOT_OK(child->Init(schema->children[i], options, error)); + } + + schema_ = schema; + return NANOARROW_OK; + } + + SEXP GetPtype() override { + if (ptype_sexp_ != R_NilValue) { + return ptype_sexp_; + } + + SEXP result = PROTECT(Rf_allocVector(VECSXP, schema_->n_children)); + SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema_->n_children)); + for (R_xlen_t i = 0; i < schema_->n_children; i++) { + struct ArrowSchema* child = schema_->children[i]; + if (child->name != NULL) { + SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); + } else { + SET_STRING_ELT(result_names, i, Rf_mkChar("")); + } + + SEXP child_sexp = PROTECT(children_[i]->GetPtype()); + SET_VECTOR_ELT(result, i, child_sexp); + UNPROTECT(1); + } + + Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); + Rf_setAttrib(result, R_NamesSymbol, result_names); + SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); + INTEGER(rownames)[0] = 
NA_INTEGER; + INTEGER(rownames)[1] = 0; + Rf_setAttrib(result, R_RowNamesSymbol, rownames); + UNPROTECT(3); + return result; + } + + private: + std::vector> children_; +}; + +#endif From 5e7df941c7080775317c2bcd925ddd13c8afe003 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 28 Feb 2024 00:06:19 -0400 Subject: [PATCH 14/36] first conversion --- r/src/vctr_builder_base.h | 42 +++++++++++++++++++++++++++++--- r/src/vctr_builder_primitive.h | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 3 deletions(-) diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index 3db311084..4912963fe 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -35,7 +35,11 @@ struct VctrBuilder { // for inspecting their value. This does not validate any ptypes (that would // happen in Init() if needed). VctrBuilder(VectorType vector_type, SEXP ptype_sexp) - : vector_type_(vector_type), ptype_sexp_(R_NilValue), value_(R_NilValue) { + : schema_(nullptr), + vector_type_(vector_type), + ptype_sexp_(R_NilValue), + value_(R_NilValue), + value_size_(0) { nanoarrow_preserve_sexp(ptype_sexp); ptype_sexp_ = ptype_sexp; } @@ -55,6 +59,14 @@ struct VctrBuilder { return NANOARROW_OK; } + virtual ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) { + if (value_ != R_NilValue) { + ArrowErrorSet(error, "VctrBuilder reallocation is not implemented"); + } + + return NANOARROW_OK; + } + // Push an array into this builder and do not take ownership of array. This is // called when the caller cannot safely relinquish ownership of an array (e.g., // convert_array()). Calling this method may longjmp. @@ -71,7 +83,13 @@ struct VctrBuilder { // Perform any final calculations required to calculate the return value. // Calling this method may longjmp. 
- virtual ArrowErrorCode Finish(ArrowError* error) { return NANOARROW_OK; } + virtual ArrowErrorCode Finish(ArrowError* error) { + if (ptype_sexp_ != R_NilValue && value_ != R_NilValue) { + Rf_copyMostAttrib(ptype_sexp_, value_); + } + + return NANOARROW_OK; + } // Release the final value of the builder. Calling this method may longjmp. virtual SEXP GetValue() { @@ -84,10 +102,28 @@ struct VctrBuilder { virtual SEXP GetPtype() { return ptype_sexp_; } protected: + const ArrowSchema* schema_; VectorType vector_type_; SEXP ptype_sexp_; SEXP value_; - const ArrowSchema* schema_; + R_xlen_t value_size_; + + // Could maybe avoid a preserve/protect + void SetValue(SEXP value) { + nanoarrow_release_sexp(value_); + value_ = value; + nanoarrow_preserve_sexp(value_); + } + + ArrowErrorCode WarnLossyConvert(const char* msg, int64_t count) { + SEXP fun = PROTECT(Rf_install("warn_lossy_conversion")); + SEXP count_sexp = PROTECT(Rf_ScalarReal((double)count)); + SEXP msg_sexp = PROTECT(Rf_mkString(msg)); + SEXP call = PROTECT(Rf_lang3(fun, count_sexp, msg_sexp)); + Rf_eval(call, nanoarrow_ns_pkg); + UNPROTECT(4); + return NANOARROW_OK; + } }; // Resolve a builder class from a schema and (optional) ptype and instantiate it diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 562c4381d..25d5a28d8 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -28,6 +28,50 @@ class UnspecifiedBuilder : public VctrBuilder { public: explicit UnspecifiedBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + if (schema->dictionary != nullptr) { + ArrowErrorSet(error, "Can't convert dictionary to vctrs::unspecified()"); + return ENOTSUP; + } + + return NANOARROW_OK; + } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override 
{ + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + value_ = PROTECT(Rf_allocVector(LGLSXP, n)); + SetValue(value_); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + int64_t not_null_count; + if (array->null_count == -1 && array->buffers[0] == nullptr) { + not_null_count = array->length; + } else if (array->null_count == -1) { + not_null_count = + ArrowBitCountSet(reinterpret_cast(array->buffers[0]), + array->offset, array->length); + } else { + not_null_count = array->length - array->null_count; + } + + if (not_null_count > 0 && array->length > 0) { + NANOARROW_RETURN_NOT_OK( + WarnLossyConvert("that were non-null set to NA", not_null_count)); + } + + int* value_ptr = LOGICAL(value_) + value_size_; + for (int64_t i = 0; i < array->length; i++) { + value_ptr[i] = NA_LOGICAL; + } + + return NANOARROW_OK; + } }; class IntBuilder : public VctrBuilder { From c1e20ac8a144af63519050e78149c25346a48808 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 29 Feb 2024 22:44:39 -0400 Subject: [PATCH 15/36] add an array view --- r/src/vctr_builder_base.h | 6 ++++++ r/src/vctr_builder_rcrd.h | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index 4912963fe..d2ef2aae5 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -40,6 +40,7 @@ struct VctrBuilder { ptype_sexp_(R_NilValue), value_(R_NilValue), value_size_(0) { + ArrowArrayViewInitFromType(&array_view_, NANOARROW_TYPE_UNINITIALIZED); nanoarrow_preserve_sexp(ptype_sexp); ptype_sexp_ = ptype_sexp; } @@ -48,14 +49,18 @@ struct VctrBuilder { virtual ~VctrBuilder() { nanoarrow_release_sexp(ptype_sexp_); nanoarrow_release_sexp(value_); + ArrowArrayViewReset(&array_view_); } // Initialize this instance with the information available to the resolver, or the // information that was inferred. 
If using the default `to`, ptype may be R_NilValue // with Options containing the inferred information. Calling this method may longjmp. + // The implementation on the base class initialized the built-in ArrowArrayView and + // saves a reference to `schema` (but subclass implementations need not call it). virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) { schema_ = schema; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view_, schema, error)); return NANOARROW_OK; } @@ -102,6 +107,7 @@ struct VctrBuilder { virtual SEXP GetPtype() { return ptype_sexp_; } protected: + ArrowArrayView array_view_; const ArrowSchema* schema_; VectorType vector_type_; SEXP ptype_sexp_; diff --git a/r/src/vctr_builder_rcrd.h b/r/src/vctr_builder_rcrd.h index 99856655a..d0f7f9fa9 100644 --- a/r/src/vctr_builder_rcrd.h +++ b/r/src/vctr_builder_rcrd.h @@ -34,8 +34,6 @@ class RcrdBuilder : public VctrBuilder { ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); - // TODO: Check can convert here // Instantiate and initialize children From 78d3d81ccf1a4d73dda8367d4dbda35ac3ae9d18 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 29 Feb 2024 23:16:54 -0400 Subject: [PATCH 16/36] maybe have conversion errors --- r/src/vctr_builder_base.h | 15 ++++++++++++++- r/src/vctr_builder_primitive.h | 8 +++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index d2ef2aae5..3660de15a 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -59,8 +59,9 @@ struct VctrBuilder { // saves a reference to `schema` (but subclass implementations need not call it). 
virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) { - schema_ = schema; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view_, schema, error)); NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view_, schema, error)); + schema_ = schema; return NANOARROW_OK; } @@ -107,6 +108,7 @@ struct VctrBuilder { virtual SEXP GetPtype() { return ptype_sexp_; } protected: + ArrowSchemaView schema_view_; ArrowArrayView array_view_; const ArrowSchema* schema_; VectorType vector_type_; @@ -130,6 +132,17 @@ struct VctrBuilder { UNPROTECT(4); return NANOARROW_OK; } + + void StopCantConvert() { + SEXP fun = PROTECT(Rf_install("stop_cant_convert_schema")); + SEXP schema_xptr = PROTECT( + R_MakeExternalPtr(const_cast(schema_), R_NilValue, R_NilValue)); + Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); + + SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); + Rf_eval(call, nanoarrow_ns_pkg); + UNPROTECT(3); + } }; // Resolve a builder class from a schema and (optional) ptype and instantiate it diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 25d5a28d8..0c09006e8 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -32,9 +32,11 @@ class UnspecifiedBuilder : public VctrBuilder { ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, ArrowError* error) override { NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); - if (schema->dictionary != nullptr) { - ArrowErrorSet(error, "Can't convert dictionary to vctrs::unspecified()"); - return ENOTSUP; + switch (schema_view_.type) { + case NANOARROW_TYPE_DICTIONARY: + StopCantConvert(); + default: + break; } return NANOARROW_OK; From 7e35f5de992dc30c533c84f4e3e5eaa4a5d7d691 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 29 Feb 2024 23:28:17 -0400 Subject: [PATCH 17/36] maybe some actual conversions --- r/src/vctr_builder_primitive.h | 120 
+++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 0c09006e8..33f996150 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -80,6 +80,126 @@ class IntBuilder : public VctrBuilder { public: explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + return NANOARROW_OK; + } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + value_ = PROTECT(Rf_allocVector(INTSXP, n)); + SetValue(value_); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + int* result = INTEGER(value_); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.array->offset; + R_xlen_t length = array->length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_INTEGER; + } + break; + case NANOARROW_TYPE_INT32: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_int32 + raw_src_offset, + length * sizeof(int32_t)); + + // Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.array->null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + ArrowBitsUnpackInt32(array_view_.buffer_views[1].data.as_uint8 + raw_src_offset, + raw_src_offset, length, result + value_size_); + + // Set any nulls to NA_LOGICAL + if (is_valid != 
NULL && array_view_.array->null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + // No need to bounds check for these types + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = (int32_t)ArrowArrayViewGetIntUnsafe(&array_view_, i); + } + + // Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.array->null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER; + } + } + } + break; + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + // Loop + bounds check. Because we don't know what memory might be + // in a null slot, we have to check nulls if there are any. 
+ if (is_valid != NULL && array_view_.array->null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowBitGet(is_valid, raw_src_offset + i)) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT_MAX || value <= NA_INTEGER) { + result[value_size_ + i] = NA_INTEGER; + n_bad_values++; + } else { + result[value_size_ + i] = (int32_t)value; + } + } else { + result[value_size_ + i] = NA_INTEGER; + } + } + } else { + for (R_xlen_t i = 0; i < length; i++) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT_MAX || value <= NA_INTEGER) { + result[value_size_ + i] = NA_INTEGER; + n_bad_values++; + } else { + result[value_size_ + i] = (int32_t)value; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + warn_lossy_conversion(n_bad_values, "outside integer range set to NA"); + } + + return NANOARROW_OK; + } + SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } }; From d269d8b29b5c7ddb6f3aa84083bfeb1de911c534 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 29 Feb 2024 23:29:43 -0400 Subject: [PATCH 18/36] fix can't convert --- r/src/vctr_builder_base.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index 3660de15a..662d20345 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -138,10 +138,11 @@ struct VctrBuilder { SEXP schema_xptr = PROTECT( R_MakeExternalPtr(const_cast(schema_), R_NilValue, R_NilValue)); Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema); + SEXP ptype_sexp = PROTECT(GetPtype()); - SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); + SEXP call = PROTECT(Rf_lang3(fun, schema_xptr, ptype_sexp)); Rf_eval(call, nanoarrow_ns_pkg); - UNPROTECT(3); + UNPROTECT(4); } }; From a931c5b704e2ccc28f7a03bda231afa4ea1d2184 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 1 Mar 2024 00:00:59 -0400 Subject: [PATCH 19/36] working for 
integers --- r/src/init.c | 2 ++ r/src/vctr_builder.cc | 31 ++++++++++++++++++++++++++++++- r/src/vctr_builder.h | 2 ++ r/src/vctr_builder_base.h | 7 +++++-- r/src/vctr_builder_primitive.h | 20 +++++++++++--------- 5 files changed, 50 insertions(+), 12 deletions(-) diff --git a/r/src/init.c b/r/src/init.c index 2e5c07967..b472cfbcb 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -97,6 +97,7 @@ extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp) extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr); +extern SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); @@ -176,6 +177,7 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, {"nanoarrow_c_infer_ptype_using_builder", (DL_FUNC)&nanoarrow_c_infer_ptype_using_builder, 1}, + {"nanoarrow_c_convert_array2", (DL_FUNC)&nanoarrow_c_convert_array2, 2}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, {"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}}; diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 45c4a2111..2c8b79df7 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -22,15 +22,16 @@ #include #include +#include "array.h" #include "materialize.h" #include "nanoarrow.h" #include "nanoarrow/r.h" #include "vctr_builder.h" #include "vctr_builder_base.h" +#include "vctr_builder_list_of.h" #include "vctr_builder_primitive.h" #include "vctr_builder_rcrd.h" -#include "vctr_builder_list_of.h" // These conversions are the default R-native type guesses for // an array that don't require extra information from the ptype (e.g., @@ -276,3 +277,31 @@ SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) { 
UNPROTECT(2); return ptype_sexp; } + +SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp) { + ArrowArray* array = nanoarrow_array_from_xptr(array_xptr); + SEXP schema_xptr = PROTECT(array_xptr_get_schema(array_xptr)); + SEXP builder_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, ptype_sexp)); + auto builder = reinterpret_cast(R_ExternalPtrAddr(builder_xptr)); + + ArrowError error; + ArrowErrorInit(&error); + + int result = builder->Reserve(array->length, &error); + if (result != NANOARROW_OK) { + Rf_error("builder->Reserve() failed: %s", error.message); + } + + result = builder->PushNext(array, &error); + if (result != NANOARROW_OK) { + Rf_error("builder->PushNext() failed: %s", error.message); + } + + result = builder->Finish(&error); + if (result != NANOARROW_OK) { + Rf_error("builder->Finish() failed: %s", error.message); + } + + UNPROTECT(2); + return builder->GetValue(); +} diff --git a/r/src/vctr_builder.h b/r/src/vctr_builder.h index a037c9513..91e16f1c4 100644 --- a/r/src/vctr_builder.h +++ b/r/src/vctr_builder.h @@ -54,6 +54,8 @@ SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp); SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); +SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp); + #ifdef __cplusplus } #endif diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index 662d20345..9bd64c27f 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -77,7 +77,8 @@ struct VctrBuilder { // called when the caller cannot safely relinquish ownership of an array (e.g., // convert_array()). Calling this method may longjmp. virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) { - return ENOTSUP; + NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(&array_view_, array, error)); + return NANOARROW_OK; } // Push an array into this builder. 
The implementation may (but is not required) to take @@ -99,9 +100,11 @@ struct VctrBuilder { // Release the final value of the builder. Calling this method may longjmp. virtual SEXP GetValue() { + SEXP value = PROTECT(value_); nanoarrow_release_sexp(value_); value_ = R_NilValue; - return value_; + UNPROTECT(1); + return value; } // Get (or allocate if required) the SEXP ptype for this output diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 33f996150..e058be217 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -44,8 +44,8 @@ class UnspecifiedBuilder : public VctrBuilder { ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); - value_ = PROTECT(Rf_allocVector(LGLSXP, n)); - SetValue(value_); + SEXP value = PROTECT(Rf_allocVector(LGLSXP, n)); + SetValue(value); UNPROTECT(1); return NANOARROW_OK; } @@ -88,19 +88,21 @@ class IntBuilder : public VctrBuilder { ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); - value_ = PROTECT(Rf_allocVector(INTSXP, n)); - SetValue(value_); + SEXP value = PROTECT(Rf_allocVector(INTSXP, n)); + SetValue(value); UNPROTECT(1); return NANOARROW_OK; } ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + int* result = INTEGER(value_); int64_t n_bad_values = 0; // True for all the types supported here const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; - int64_t raw_src_offset = array_view_.array->offset; + int64_t raw_src_offset = array_view_.offset; R_xlen_t length = array->length; // Fill the buffer @@ -116,7 +118,7 @@ class IntBuilder : public VctrBuilder { length * sizeof(int32_t)); // Set any nulls to NA_INTEGER - if (is_valid != NULL && array_view_.array->null_count != 0) { + if (is_valid != NULL && array_view_.null_count != 
0) { for (R_xlen_t i = 0; i < length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[value_size_ + i] = NA_INTEGER; @@ -129,7 +131,7 @@ class IntBuilder : public VctrBuilder { raw_src_offset, length, result + value_size_); // Set any nulls to NA_LOGICAL - if (is_valid != NULL && array_view_.array->null_count != 0) { + if (is_valid != NULL && array_view_.null_count != 0) { for (R_xlen_t i = 0; i < length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[value_size_ + i] = NA_LOGICAL; @@ -147,7 +149,7 @@ class IntBuilder : public VctrBuilder { } // Set any nulls to NA_INTEGER - if (is_valid != NULL && array_view_.array->null_count != 0) { + if (is_valid != NULL && array_view_.null_count != 0) { for (R_xlen_t i = 0; i < length; i++) { if (!ArrowBitGet(is_valid, raw_src_offset + i)) { result[value_size_ + i] = NA_INTEGER; @@ -162,7 +164,7 @@ class IntBuilder : public VctrBuilder { case NANOARROW_TYPE_DOUBLE: // Loop + bounds check. Because we don't know what memory might be // in a null slot, we have to check nulls if there are any. 
- if (is_valid != NULL && array_view_.array->null_count != 0) { + if (is_valid != NULL && array_view_.null_count != 0) { for (R_xlen_t i = 0; i < length; i++) { if (ArrowBitGet(is_valid, raw_src_offset + i)) { int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); From 9e38a65b13081ec703541861ff1dde91c5d542b2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 6 Mar 2024 15:09:43 -0400 Subject: [PATCH 20/36] shuffle --- r/src/vctr_builder.cc | 2 + r/src/vctr_builder_int.h | 156 ++++++++++++++++++++++++++ r/src/vctr_builder_primitive.h | 181 ------------------------------- r/src/vctr_builder_unspecified.h | 79 ++++++++++++++ 4 files changed, 237 insertions(+), 181 deletions(-) create mode 100644 r/src/vctr_builder_int.h create mode 100644 r/src/vctr_builder_unspecified.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 2c8b79df7..a0d454517 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -29,9 +29,11 @@ #include "vctr_builder.h" #include "vctr_builder_base.h" +#include "vctr_builder_int.h" #include "vctr_builder_list_of.h" #include "vctr_builder_primitive.h" #include "vctr_builder_rcrd.h" +#include "vctr_builder_unspecified.h" // These conversions are the default R-native type guesses for // an array that don't require extra information from the ptype (e.g., diff --git a/r/src/vctr_builder_int.h b/r/src/vctr_builder_int.h new file mode 100644 index 000000000..3b67fb3e6 --- /dev/null +++ b/r/src/vctr_builder_int.h @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_INT_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_INT_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class IntBuilder : public VctrBuilder { + public: + explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + return NANOARROW_OK; + } + + SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(INTSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + + int* result = INTEGER(value_); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array->length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_INTEGER; + } + break; + case NANOARROW_TYPE_INT32: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_int32 + raw_src_offset, + length * sizeof(int32_t)); + + 
// Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + ArrowBitsUnpackInt32(array_view_.buffer_views[1].data.as_uint8 + raw_src_offset, + raw_src_offset, length, result + value_size_); + + // Set any nulls to NA_LOGICAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + // No need to bounds check for these types + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = (int32_t)ArrowArrayViewGetIntUnsafe(&array_view_, i); + } + + // Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER; + } + } + } + break; + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + // Loop + bounds check. Because we don't know what memory might be + // in a null slot, we have to check nulls if there are any. 
+ if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowBitGet(is_valid, raw_src_offset + i)) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT_MAX || value <= NA_INTEGER) { + result[value_size_ + i] = NA_INTEGER; + n_bad_values++; + } else { + result[value_size_ + i] = (int32_t)value; + } + } else { + result[value_size_ + i] = NA_INTEGER; + } + } + } else { + for (R_xlen_t i = 0; i < length; i++) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT_MAX || value <= NA_INTEGER) { + result[value_size_ + i] = NA_INTEGER; + n_bad_values++; + } else { + result[value_size_ + i] = (int32_t)value; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + WarnLossyConvert("outside integer range set to NA", n_bad_values); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index e058be217..115acde41 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,187 +24,6 @@ #include "vctr_builder_base.h" -class UnspecifiedBuilder : public VctrBuilder { - public: - explicit UnspecifiedBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} - - ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, - ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); - switch (schema_view_.type) { - case NANOARROW_TYPE_DICTIONARY: - StopCantConvert(); - default: - break; - } - - return NANOARROW_OK; - } - - ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); - SEXP value = PROTECT(Rf_allocVector(LGLSXP, n)); - SetValue(value); - UNPROTECT(1); - return NANOARROW_OK; - } - - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - int64_t not_null_count; - if 
(array->null_count == -1 && array->buffers[0] == nullptr) { - not_null_count = array->length; - } else if (array->null_count == -1) { - not_null_count = - ArrowBitCountSet(reinterpret_cast(array->buffers[0]), - array->offset, array->length); - } else { - not_null_count = array->length - array->null_count; - } - - if (not_null_count > 0 && array->length > 0) { - NANOARROW_RETURN_NOT_OK( - WarnLossyConvert("that were non-null set to NA", not_null_count)); - } - - int* value_ptr = LOGICAL(value_) + value_size_; - for (int64_t i = 0; i < array->length; i++) { - value_ptr[i] = NA_LOGICAL; - } - - return NANOARROW_OK; - } -}; - -class IntBuilder : public VctrBuilder { - public: - explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} - - ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, - ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); - return NANOARROW_OK; - } - - ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); - SEXP value = PROTECT(Rf_allocVector(INTSXP, n)); - SetValue(value); - UNPROTECT(1); - return NANOARROW_OK; - } - - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); - - int* result = INTEGER(value_); - int64_t n_bad_values = 0; - - // True for all the types supported here - const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; - int64_t raw_src_offset = array_view_.offset; - R_xlen_t length = array->length; - - // Fill the buffer - switch (array_view_.storage_type) { - case NANOARROW_TYPE_NA: - for (R_xlen_t i = 0; i < length; i++) { - result[value_size_ + i] = NA_INTEGER; - } - break; - case NANOARROW_TYPE_INT32: - memcpy(result + value_size_, - array_view_.buffer_views[1].data.as_int32 + raw_src_offset, - length * sizeof(int32_t)); - - // Set any nulls to NA_INTEGER - if 
(is_valid != NULL && array_view_.null_count != 0) { - for (R_xlen_t i = 0; i < length; i++) { - if (!ArrowBitGet(is_valid, raw_src_offset + i)) { - result[value_size_ + i] = NA_INTEGER; - } - } - } - break; - case NANOARROW_TYPE_BOOL: - ArrowBitsUnpackInt32(array_view_.buffer_views[1].data.as_uint8 + raw_src_offset, - raw_src_offset, length, result + value_size_); - - // Set any nulls to NA_LOGICAL - if (is_valid != NULL && array_view_.null_count != 0) { - for (R_xlen_t i = 0; i < length; i++) { - if (!ArrowBitGet(is_valid, raw_src_offset + i)) { - result[value_size_ + i] = NA_LOGICAL; - } - } - } - break; - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: - // No need to bounds check for these types - for (R_xlen_t i = 0; i < length; i++) { - result[value_size_ + i] = (int32_t)ArrowArrayViewGetIntUnsafe(&array_view_, i); - } - - // Set any nulls to NA_INTEGER - if (is_valid != NULL && array_view_.null_count != 0) { - for (R_xlen_t i = 0; i < length; i++) { - if (!ArrowBitGet(is_valid, raw_src_offset + i)) { - result[value_size_ + i] = NA_INTEGER; - } - } - } - break; - case NANOARROW_TYPE_UINT32: - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: - case NANOARROW_TYPE_FLOAT: - case NANOARROW_TYPE_DOUBLE: - // Loop + bounds check. Because we don't know what memory might be - // in a null slot, we have to check nulls if there are any. 
- if (is_valid != NULL && array_view_.null_count != 0) { - for (R_xlen_t i = 0; i < length; i++) { - if (ArrowBitGet(is_valid, raw_src_offset + i)) { - int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); - if (value > INT_MAX || value <= NA_INTEGER) { - result[value_size_ + i] = NA_INTEGER; - n_bad_values++; - } else { - result[value_size_ + i] = (int32_t)value; - } - } else { - result[value_size_ + i] = NA_INTEGER; - } - } - } else { - for (R_xlen_t i = 0; i < length; i++) { - int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); - if (value > INT_MAX || value <= NA_INTEGER) { - result[value_size_ + i] = NA_INTEGER; - n_bad_values++; - } else { - result[value_size_ + i] = (int32_t)value; - } - } - } - break; - - default: - return EINVAL; - } - - if (n_bad_values > 0) { - warn_lossy_conversion(n_bad_values, "outside integer range set to NA"); - } - - return NANOARROW_OK; - } - - SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } -}; - class DblBuilder : public VctrBuilder { public: explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} diff --git a/r/src/vctr_builder_unspecified.h b/r/src/vctr_builder_unspecified.h new file mode 100644 index 000000000..fd3f5098b --- /dev/null +++ b/r/src/vctr_builder_unspecified.h @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_UNSPECIFIED_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_UNSPECIFIED_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class UnspecifiedBuilder : public VctrBuilder { + public: + explicit UnspecifiedBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_DICTIONARY: + StopCantConvert(); + default: + break; + } + + return NANOARROW_OK; + } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(LGLSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + int64_t not_null_count; + if (array->null_count == -1 && array->buffers[0] == nullptr) { + not_null_count = array->length; + } else if (array->null_count == -1) { + not_null_count = + ArrowBitCountSet(reinterpret_cast(array->buffers[0]), + array->offset, array->length); + } else { + not_null_count = array->length - array->null_count; + } + + if (not_null_count > 0 && array->length > 0) { + NANOARROW_RETURN_NOT_OK( + WarnLossyConvert("that were non-null set to NA", not_null_count)); + } + + int* value_ptr = LOGICAL(value_) + value_size_; + for (int64_t i = 0; i < 
array->length; i++) { + value_ptr[i] = NA_LOGICAL; + } + + return NANOARROW_OK; + } +}; + +#endif \ No newline at end of file From 2688e2801d197fe742a72da690e3f1a23641fa23 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 6 Mar 2024 15:27:45 -0400 Subject: [PATCH 21/36] add double impl --- r/src/vctr_builder.cc | 1 + r/src/vctr_builder_dbl.h | 134 +++++++++++++++++++++++++++++++++ r/src/vctr_builder_int.h | 6 -- r/src/vctr_builder_primitive.h | 7 -- 4 files changed, 135 insertions(+), 13 deletions(-) create mode 100644 r/src/vctr_builder_dbl.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index a0d454517..bfd27068a 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -29,6 +29,7 @@ #include "vctr_builder.h" #include "vctr_builder_base.h" +#include "vctr_builder_dbl.h" #include "vctr_builder_int.h" #include "vctr_builder_list_of.h" #include "vctr_builder_primitive.h" diff --git a/r/src/vctr_builder_dbl.h b/r/src/vctr_builder_dbl.h new file mode 100644 index 000000000..ae9fab61d --- /dev/null +++ b/r/src/vctr_builder_dbl.h @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_DBL_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_DBL_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +// bit64::as.integer64(2^53) +#define MAX_DBL_AS_INTEGER 9007199254740992 + +class DblBuilder : public VctrBuilder { + public: + explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(REALSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + + double* result = REAL(value_); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array_view_.length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_REAL; + } + break; + case NANOARROW_TYPE_DOUBLE: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_double + raw_src_offset, + length * sizeof(double)); + + // Set any nulls to NA_REAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_REAL; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_FLOAT: + // No need to bounds check these types + for (R_xlen_t i = 0; i < length; i++) 
{ + result[value_size_ + i] = ArrowArrayViewGetDoubleUnsafe(&array_view_, i); + } + + // Set any nulls to NA_REAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_REAL; + } + } + } + break; + + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + for (R_xlen_t i = 0; i < length; i++) { + double value = ArrowArrayViewGetDoubleUnsafe(&array_view_, i); + if (value > MAX_DBL_AS_INTEGER || value < -MAX_DBL_AS_INTEGER) { + // Content of null slot is undefined + n_bad_values += is_valid == NULL || ArrowBitGet(is_valid, raw_src_offset + i); + } + + result[value_size_ + i] = value; + } + + // Set any nulls to NA_REAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_REAL; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + warn_lossy_conversion( + n_bad_values, "may have incurred loss of precision in conversion to double()"); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_int.h b/r/src/vctr_builder_int.h index 3b67fb3e6..b37d9d6ba 100644 --- a/r/src/vctr_builder_int.h +++ b/r/src/vctr_builder_int.h @@ -28,12 +28,6 @@ class IntBuilder : public VctrBuilder { public: explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} - ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, - ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); - return NANOARROW_OK; - } - SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 115acde41..712b4be28 100644 --- a/r/src/vctr_builder_primitive.h +++ 
b/r/src/vctr_builder_primitive.h @@ -24,13 +24,6 @@ #include "vctr_builder_base.h" -class DblBuilder : public VctrBuilder { - public: - explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} - - SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } -}; - class LglBuilder : public VctrBuilder { public: explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} From 24d301b98a8ae66aab813c78178725f83a814492 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 6 Mar 2024 15:36:49 -0400 Subject: [PATCH 22/36] get logical conversion ported --- r/src/vctr_builder.cc | 1 + r/src/vctr_builder_lgl.h | 103 +++++++++++++++++++++++++++++++++ r/src/vctr_builder_primitive.h | 7 --- 3 files changed, 104 insertions(+), 7 deletions(-) create mode 100644 r/src/vctr_builder_lgl.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index bfd27068a..7b53797fa 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -30,6 +30,7 @@ #include "vctr_builder.h" #include "vctr_builder_base.h" #include "vctr_builder_dbl.h" +#include "vctr_builder_lgl.h" #include "vctr_builder_int.h" #include "vctr_builder_list_of.h" #include "vctr_builder_primitive.h" diff --git a/r/src/vctr_builder_lgl.h b/r/src/vctr_builder_lgl.h new file mode 100644 index 000000000..ff5dd981d --- /dev/null +++ b/r/src/vctr_builder_lgl.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_LGL_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_LGL_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class LglBuilder : public VctrBuilder { + public: + explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(LGLSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(LGLSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + const uint8_t* data_buffer = array_view_.buffer_views[1].data.as_uint8; + + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array->length; + int* result = LOGICAL(value_); + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_LOGICAL; + } + break; + case NANOARROW_TYPE_BOOL: + ArrowBitsUnpackInt32(data_buffer, raw_src_offset, length, result + value_size_); + + // Set any nulls to NA_LOGICAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + 
result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = ArrowArrayViewGetIntUnsafe(&array_view_, i) != 0; + } + + // Set any nulls to NA_LOGICAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + + default: + return EINVAL; + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 712b4be28..301e894c7 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,13 +24,6 @@ #include "vctr_builder_base.h" -class LglBuilder : public VctrBuilder { - public: - explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} - - SEXP GetPtype() override { return Rf_allocVector(LGLSXP, 0); } -}; - class Integer64Builder : public VctrBuilder { public: explicit Integer64Builder(SEXP ptype_sexp) From 1782799f21aabcac2fd9e1b169686ad9778b2417 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 6 Mar 2024 15:46:42 -0400 Subject: [PATCH 23/36] port int64 to new class --- r/src/vctr_builder.cc | 3 +- r/src/vctr_builder_int64.h | 139 +++++++++++++++++++++++++++++++++ r/src/vctr_builder_primitive.h | 6 -- 3 files changed, 141 insertions(+), 7 deletions(-) create mode 100644 r/src/vctr_builder_int64.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 7b53797fa..3fb8803dc 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -30,8 +30,9 @@ #include "vctr_builder.h" #include "vctr_builder_base.h" #include 
"vctr_builder_dbl.h" -#include "vctr_builder_lgl.h" #include "vctr_builder_int.h" +#include "vctr_builder_int64.h" +#include "vctr_builder_lgl.h" #include "vctr_builder_list_of.h" #include "vctr_builder_primitive.h" #include "vctr_builder_rcrd.h" diff --git a/r/src/vctr_builder_int64.h b/r/src/vctr_builder_int64.h new file mode 100644 index 000000000..41e12c5a2 --- /dev/null +++ b/r/src/vctr_builder_int64.h @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_INT64_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_INT64_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +#define NA_INTEGER64 INT64_MIN + +class Integer64Builder : public VctrBuilder { + public: + explicit Integer64Builder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_INTEGER64, ptype_sexp) {} + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(REALSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + + int64_t* result = reinterpret_cast(REAL(value_)); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array->length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_INTEGER64; + } + break; + case NANOARROW_TYPE_INT64: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_int32 + raw_src_offset, + length * sizeof(int64_t)); + + // Set any nulls to NA_INTEGER64 + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER64; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + // No need to bounds check for these types + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = ArrowArrayViewGetIntUnsafe(&array_view_, i); + } + + 
// Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER64; + } + } + } + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + // Loop + bounds check. Because we don't know what memory might be + // in a null slot, we have to check nulls if there are any. + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowBitGet(is_valid, raw_src_offset + i)) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT64_MAX || value <= NA_INTEGER64) { + result[value_size_ + i] = NA_INTEGER64; + n_bad_values++; + } else { + result[value_size_ + i] = value; + } + } else { + result[value_size_ + i] = NA_INTEGER64; + } + } + } else { + for (R_xlen_t i = 0; i < length; i++) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT64_MAX || value <= NA_INTEGER64) { + result[value_size_ + i] = NA_INTEGER64; + n_bad_values++; + } else { + result[value_size_ + i] = value; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + warn_lossy_conversion(n_bad_values, "outside integer64 range set to NA"); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 301e894c7..3fffdfb92 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,12 +24,6 @@ #include "vctr_builder_base.h" -class Integer64Builder : public VctrBuilder { - public: - explicit Integer64Builder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_INTEGER64, ptype_sexp) {} -}; - class ChrBuilder : public VctrBuilder { public: explicit ChrBuilder(SEXP ptype_sexp) From e907d6b22098f5acbb4b4dcf0df220c04c4f14c1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 6 Mar 2024 15:49:09 -0400 
Subject: [PATCH 24/36] fix lossy convert --- r/src/vctr_builder_int64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/vctr_builder_int64.h b/r/src/vctr_builder_int64.h index 41e12c5a2..211e70b76 100644 --- a/r/src/vctr_builder_int64.h +++ b/r/src/vctr_builder_int64.h @@ -129,7 +129,7 @@ class Integer64Builder : public VctrBuilder { } if (n_bad_values > 0) { - warn_lossy_conversion(n_bad_values, "outside integer64 range set to NA"); + WarnLossyConvert("outside integer64 range set to NA", n_bad_values); } return NANOARROW_OK; From 14b9ef54f496e8ec1be3f106e29345fb297584f0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Wed, 6 Mar 2024 15:49:33 -0400 Subject: [PATCH 25/36] format --- r/src/vctr_builder_unspecified.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/vctr_builder_unspecified.h b/r/src/vctr_builder_unspecified.h index fd3f5098b..0556a7c8b 100644 --- a/r/src/vctr_builder_unspecified.h +++ b/r/src/vctr_builder_unspecified.h @@ -76,4 +76,4 @@ class UnspecifiedBuilder : public VctrBuilder { } }; -#endif \ No newline at end of file +#endif From 86a681d2d5afd474e72f12fe354ba427b7113b2a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 9 Mar 2024 21:09:18 -0400 Subject: [PATCH 26/36] wire up chr --- r/src/vctr_builder.cc | 1 + r/src/vctr_builder_chr.h | 98 ++++++++++++++++++++++++++++++++++ r/src/vctr_builder_primitive.h | 11 ---- 3 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 r/src/vctr_builder_chr.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 3fb8803dc..7d8657102 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -29,6 +29,7 @@ #include "vctr_builder.h" #include "vctr_builder_base.h" +#include "vctr_builder_chr.h" #include "vctr_builder_dbl.h" #include "vctr_builder_int.h" #include "vctr_builder_int64.h" diff --git a/r/src/vctr_builder_chr.h b/r/src/vctr_builder_chr.h new file mode 100644 index 000000000..a25c88523 --- /dev/null +++ 
b/r/src/vctr_builder_chr.h @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_CHR_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_CHR_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include +#include + +#include "vctr_builder_base.h" + +class ChrBuilder : public VctrBuilder { + public: + explicit ChrBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(STRSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(STRSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + R_xlen_t length = array_view_.length; + + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + SET_STRING_ELT(value_, value_size_ + i, NA_STRING); + } + return NANOARROW_OK; + + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case 
NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: { + char buf[64]; + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowArrayViewIsNull(&array_view_, i)) { + SET_STRING_ELT(value_, value_size_ + i, NA_STRING); + } else { + int n_chars = snprintf(buf, sizeof(buf), "%" PRId64, + ArrowArrayViewGetIntUnsafe(&array_view_, i)); + SET_STRING_ELT(value_, value_size_ + i, + Rf_mkCharLenCE(buf, n_chars, CE_UTF8)); + } + } + return NANOARROW_OK; + } + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: { + struct ArrowStringView item; + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowArrayViewIsNull(&array_view_, i)) { + SET_STRING_ELT(value_, value_size_ + i, NA_STRING); + } else { + item = ArrowArrayViewGetStringUnsafe(&array_view_, i); + SET_STRING_ELT(value_, value_size_ + i, + Rf_mkCharLenCE(item.data, (int)item.size_bytes, CE_UTF8)); + } + } + + return NANOARROW_OK; + } + + default: + return ENOTSUP; + } + } +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 3fffdfb92..b71205619 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,17 +24,6 @@ #include "vctr_builder_base.h" -class ChrBuilder : public VctrBuilder { - public: - explicit ChrBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp), - use_altrep_(VCTR_BUILDER_USE_ALTREP_DEFAULT) {} - - SEXP GetPtype() override { return Rf_allocVector(STRSXP, 0); } - - VctrBuilderUseAltrep use_altrep_; -}; - class BlobBuilder : public VctrBuilder { public: explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} From c9b87206090524b92d23fa69c8ff1ce7a2547013 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 9 Mar 2024 21:41:53 -0400 Subject: [PATCH 27/36] wire up blob --- r/src/vctr_builder.cc | 5 ++- r/src/vctr_builder_blob.h | 73 ++++++++++++++++++++++++++++++++++ r/src/vctr_builder_primitive.h | 5 --- 3 files changed, 76 insertions(+), 7 
deletions(-) create mode 100644 r/src/vctr_builder_blob.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 7d8657102..abc2399aa 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -29,6 +29,7 @@ #include "vctr_builder.h" #include "vctr_builder_base.h" +#include "vctr_builder_blob.h" #include "vctr_builder_chr.h" #include "vctr_builder_dbl.h" #include "vctr_builder_int.h" @@ -195,10 +196,10 @@ ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, vector_type = VECTOR_TYPE_DATA_FRAME; } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified")) { vector_type = VECTOR_TYPE_UNSPECIFIED; - } else if (Rf_inherits(ptype_sexp, "vctrs_list_of")) { - vector_type = VECTOR_TYPE_LIST_OF; } else if (Rf_inherits(ptype_sexp, "blob")) { vector_type = VECTOR_TYPE_BLOB; + } else if (Rf_inherits(ptype_sexp, "vctrs_list_of")) { + vector_type = VECTOR_TYPE_LIST_OF; } else if (Rf_inherits(ptype_sexp, "Date")) { vector_type = VECTOR_TYPE_DATE; } else if (Rf_inherits(ptype_sexp, "hms")) { diff --git a/r/src/vctr_builder_blob.h b/r/src/vctr_builder_blob.h new file mode 100644 index 000000000..c6ba54260 --- /dev/null +++ b/r/src/vctr_builder_blob.h @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_BLOB_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_BLOB_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class BlobBuilder : public VctrBuilder { + public: + explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(VECSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + R_xlen_t length = array_view_.length; + + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + // Works because lists are filled with R_NilValue by default + // when allocated. 
+ return NANOARROW_OK; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + break; + default: + return ENOTSUP; + } + + struct ArrowBufferView item; + SEXP item_sexp; + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowArrayViewIsNull(&array_view_, i)) { + item = ArrowArrayViewGetBytesUnsafe(&array_view_, i); + item_sexp = PROTECT(Rf_allocVector(RAWSXP, item.size_bytes)); + memcpy(RAW(item_sexp), item.data.data, item.size_bytes); + SET_VECTOR_ELT(value_, value_size_ + i, item_sexp); + UNPROTECT(1); + } + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index b71205619..d9b290461 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,11 +24,6 @@ #include "vctr_builder_base.h" -class BlobBuilder : public VctrBuilder { - public: - explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} -}; - class DateBuilder : public VctrBuilder { public: explicit DateBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DATE, ptype_sexp) {} From cb88afe3f7688a60199a29970935250721a2dd66 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 9 Mar 2024 22:21:36 -0400 Subject: [PATCH 28/36] wire up date converter --- r/src/vctr_builder.cc | 1 + r/src/vctr_builder_date.h | 46 ++++++++++++++++++++++++++++++++++ r/src/vctr_builder_dbl.h | 7 +++--- r/src/vctr_builder_primitive.h | 5 ---- 4 files changed, 51 insertions(+), 8 deletions(-) create mode 100644 r/src/vctr_builder_date.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index abc2399aa..3839c8f90 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -31,6 +31,7 @@ #include "vctr_builder_base.h" #include "vctr_builder_blob.h" #include "vctr_builder_chr.h" +#include "vctr_builder_date.h" #include "vctr_builder_dbl.h" #include "vctr_builder_int.h" #include "vctr_builder_int64.h" diff --git 
a/r/src/vctr_builder_date.h b/r/src/vctr_builder_date.h new file mode 100644 index 000000000..65ebf265f --- /dev/null +++ b/r/src/vctr_builder_date.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_DATE_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_DATE_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_dbl.h" + +class DateBuilder : public DblBuilder { + public: + explicit DateBuilder(SEXP ptype_sexp) : DblBuilder(ptype_sexp, VECTOR_TYPE_DATE) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(DblBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_DATE32: + break; + default: + StopCantConvert(); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_dbl.h b/r/src/vctr_builder_dbl.h index ae9fab61d..29985b240 100644 --- a/r/src/vctr_builder_dbl.h +++ b/r/src/vctr_builder_dbl.h @@ -29,7 +29,8 @@ class DblBuilder : public VctrBuilder { public: - explicit DblBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DBL, ptype_sexp) {} + explicit DblBuilder(SEXP ptype_sexp, 
VectorType vector_type = VECTOR_TYPE_DBL) + : VctrBuilder(vector_type, ptype_sexp) {} SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } @@ -123,8 +124,8 @@ class DblBuilder : public VctrBuilder { } if (n_bad_values > 0) { - warn_lossy_conversion( - n_bad_values, "may have incurred loss of precision in conversion to double()"); + WarnLossyConvert("may have incurred loss of precision in conversion to double()", + n_bad_values); } return NANOARROW_OK; diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index d9b290461..7b7996263 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,11 +24,6 @@ #include "vctr_builder_base.h" -class DateBuilder : public VctrBuilder { - public: - explicit DateBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_DATE, ptype_sexp) {} -}; - class HmsBuilder : public VctrBuilder { public: explicit HmsBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_HMS, ptype_sexp) {} From f439865b12d9884ea5975e1115639dfbe50af6a9 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 9 Mar 2024 23:29:59 -0400 Subject: [PATCH 29/36] wire up more converters --- r/src/vctr_builder.cc | 2 + r/src/vctr_builder_dbl.h | 2 +- r/src/vctr_builder_difftime.h | 125 +++++++++++++++++++++++++++++++++ r/src/vctr_builder_hms.h | 47 +++++++++++++ r/src/vctr_builder_primitive.h | 11 --- 5 files changed, 175 insertions(+), 12 deletions(-) create mode 100644 r/src/vctr_builder_difftime.h create mode 100644 r/src/vctr_builder_hms.h diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 3839c8f90..3873426e4 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -33,6 +33,8 @@ #include "vctr_builder_chr.h" #include "vctr_builder_date.h" #include "vctr_builder_dbl.h" +#include "vctr_builder_difftime.h" +#include "vctr_builder_hms.h" #include "vctr_builder_int.h" #include "vctr_builder_int64.h" #include "vctr_builder_lgl.h" diff --git a/r/src/vctr_builder_dbl.h 
b/r/src/vctr_builder_dbl.h index 29985b240..c31114775 100644 --- a/r/src/vctr_builder_dbl.h +++ b/r/src/vctr_builder_dbl.h @@ -42,7 +42,7 @@ class DblBuilder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); double* result = REAL(value_); diff --git a/r/src/vctr_builder_difftime.h b/r/src/vctr_builder_difftime.h new file mode 100644 index 000000000..18dd5a659 --- /dev/null +++ b/r/src/vctr_builder_difftime.h @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_DIFFTIME_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_DIFFTIME_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_dbl.h" + +class DifftimeBuilder : public DblBuilder { + public: + explicit DifftimeBuilder(SEXP ptype_sexp, VectorType vector_type = VECTOR_TYPE_DIFFTIME) + : DblBuilder(ptype_sexp, vector_type), scale_(0) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(DblBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + break; + default: + StopCantConvert(); + } + + switch (GetTimeUnits(ptype_sexp_)) { + case R_TIME_UNIT_MINUTES: + scale_ = 1.0 / 60; + break; + case R_TIME_UNIT_HOURS: + scale_ = 1.0 / (60 * 60); + break; + case R_TIME_UNIT_DAYS: + scale_ = 1.0 / (60 * 60 * 24); + break; + case R_TIME_UNIT_WEEKS: + scale_ = 1.0 / (60 * 60 * 24 * 7); + break; + default: + scale_ = 1.0; + break; + } + + switch (schema_view_.time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + scale_ *= 1; + break; + case NANOARROW_TIME_UNIT_MILLI: + scale_ *= 1e-3; + break; + case NANOARROW_TIME_UNIT_MICRO: + scale_ *= 1e-6; + break; + case NANOARROW_TIME_UNIT_NANO: + scale_ *= 1e-9; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; + } + + virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + R_xlen_t value_size0 = value_size_; + NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array, error)); + + if (scale_ != 1) { + double* result = REAL(value_); + for (int64_t i = 0; i < array_view_.length; i++) { + result[value_size0 + i] = result[value_size0 + i] * scale_; + } + } + + return NANOARROW_OK; + } + + private: + double scale_; + + static RTimeUnits GetTimeUnits(SEXP ptype) { + SEXP units_attr = Rf_getAttrib(ptype, Rf_install("units")); + if (units_attr 
== R_NilValue || TYPEOF(units_attr) != STRSXP || + Rf_length(units_attr) != 1) { + Rf_error("Expected difftime 'units' attribute of type character(1)"); + } + + const char* dst_units = Rf_translateCharUTF8(STRING_ELT(units_attr, 0)); + if (strcmp(dst_units, "secs") == 0) { + return R_TIME_UNIT_SECONDS; + } else if (strcmp(dst_units, "mins") == 0) { + return R_TIME_UNIT_MINUTES; + } else if (strcmp(dst_units, "hours") == 0) { + return R_TIME_UNIT_HOURS; + } else if (strcmp(dst_units, "days") == 0) { + return R_TIME_UNIT_DAYS; + } else if (strcmp(dst_units, "weeks") == 0) { + return R_TIME_UNIT_WEEKS; + } else { + Rf_error("Unexpected value for difftime 'units' attribute"); + return R_TIME_UNIT_SECONDS; + } + } +}; + +#endif diff --git a/r/src/vctr_builder_hms.h b/r/src/vctr_builder_hms.h new file mode 100644 index 000000000..bd2052f31 --- /dev/null +++ b/r/src/vctr_builder_hms.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_HMS_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_HMS_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_difftime.h" + +class HmsBuilder : public DifftimeBuilder { + public: + explicit HmsBuilder(SEXP ptype_sexp) : DifftimeBuilder(ptype_sexp, VECTOR_TYPE_HMS) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(DifftimeBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + break; + default: + StopCantConvert(); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 7b7996263..8d4bc5fc6 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,23 +24,12 @@ #include "vctr_builder_base.h" -class HmsBuilder : public VctrBuilder { - public: - explicit HmsBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_HMS, ptype_sexp) {} -}; - class PosixctBuilder : public VctrBuilder { public: explicit PosixctBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_POSIXCT, ptype_sexp) {} }; -class DifftimeBuilder : public VctrBuilder { - public: - explicit DifftimeBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_DIFFTIME, ptype_sexp) {} -}; - class OtherBuilder : public VctrBuilder { public: explicit OtherBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp) {} From 90c8a0d7f1c0662bee76af0d1b10052673ddf5d5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 9 Mar 2024 23:43:44 -0400 Subject: [PATCH 30/36] wire up posixct --- r/src/vctr_builder.cc | 1 + r/src/vctr_builder_difftime.h | 2 +- r/src/vctr_builder_posixct.h | 90 ++++++++++++++++++++++++++++++++++ r/src/vctr_builder_primitive.h | 6 --- 4 files changed, 92 insertions(+), 7 deletions(-) create mode 100644 r/src/vctr_builder_posixct.h diff --git 
a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 3873426e4..2edd03429 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -39,6 +39,7 @@ #include "vctr_builder_int64.h" #include "vctr_builder_lgl.h" #include "vctr_builder_list_of.h" +#include "vctr_builder_posixct.h" #include "vctr_builder_primitive.h" #include "vctr_builder_rcrd.h" #include "vctr_builder_unspecified.h" diff --git a/r/src/vctr_builder_difftime.h b/r/src/vctr_builder_difftime.h index 18dd5a659..6c6d00b42 100644 --- a/r/src/vctr_builder_difftime.h +++ b/r/src/vctr_builder_difftime.h @@ -80,7 +80,7 @@ class DifftimeBuilder : public DblBuilder { return NANOARROW_OK; } - virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { R_xlen_t value_size0 = value_size_; NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array, error)); diff --git a/r/src/vctr_builder_posixct.h b/r/src/vctr_builder_posixct.h new file mode 100644 index 000000000..ee0a38ccd --- /dev/null +++ b/r/src/vctr_builder_posixct.h @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_POSIXCT_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_POSIXCT_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_dbl.h" + +class PosixctBuilder : public DblBuilder { + public: + explicit PosixctBuilder(SEXP ptype_sexp) + : DblBuilder(ptype_sexp, VECTOR_TYPE_POSIXCT), scale_(0) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(DblBuilder::Init(schema, options, error)); + + ArrowTimeUnit time_unit = NANOARROW_TIME_UNIT_SECOND; + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + break; + case NANOARROW_TYPE_DATE64: + time_unit = NANOARROW_TIME_UNIT_MILLI; + break; + case NANOARROW_TYPE_TIMESTAMP: + time_unit = schema_view_.time_unit; + break; + default: + StopCantConvert(); + } + + scale_ = 1; + + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + scale_ *= 1; + break; + case NANOARROW_TIME_UNIT_MILLI: + scale_ *= 1e-3; + break; + case NANOARROW_TIME_UNIT_MICRO: + scale_ *= 1e-6; + break; + case NANOARROW_TIME_UNIT_NANO: + scale_ *= 1e-9; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + R_xlen_t value_size0 = value_size_; + NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array, error)); + + if (scale_ != 1) { + double* result = REAL(value_); + for (int64_t i = 0; i < array_view_.length; i++) { + result[value_size0 + i] = result[value_size0 + i] * scale_; + } + } + + return NANOARROW_OK; + } + + private: + double scale_; +}; + +#endif diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_primitive.h index 8d4bc5fc6..6f72ce712 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_primitive.h @@ -24,12 +24,6 @@ #include "vctr_builder_base.h" -class PosixctBuilder : public VctrBuilder { - public: - explicit PosixctBuilder(SEXP ptype_sexp) - : VctrBuilder(VECTOR_TYPE_POSIXCT, 
ptype_sexp) {} -}; - class OtherBuilder : public VctrBuilder { public: explicit OtherBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp) {} From dcd08a9353457d288312b3d0f4a34fd1b5213e7d Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 14 Mar 2024 17:56:13 -0300 Subject: [PATCH 31/36] start on the call-into-r --- r/src/vctr_builder.cc | 2 +- r/src/{vctr_builder_primitive.h => vctr_builder_other.h} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename r/src/{vctr_builder_primitive.h => vctr_builder_other.h} (90%) diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 2edd03429..f9dae98e3 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -39,8 +39,8 @@ #include "vctr_builder_int64.h" #include "vctr_builder_lgl.h" #include "vctr_builder_list_of.h" +#include "vctr_builder_other.h" #include "vctr_builder_posixct.h" -#include "vctr_builder_primitive.h" #include "vctr_builder_rcrd.h" #include "vctr_builder_unspecified.h" diff --git a/r/src/vctr_builder_primitive.h b/r/src/vctr_builder_other.h similarity index 90% rename from r/src/vctr_builder_primitive.h rename to r/src/vctr_builder_other.h index 6f72ce712..e0d2c9871 100644 --- a/r/src/vctr_builder_primitive.h +++ b/r/src/vctr_builder_other.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-#ifndef R_NANOARROW_VCTR_BUILDER_PRIMITIVE_H_INCLUDED -#define R_NANOARROW_VCTR_BUILDER_PRIMITIVE_H_INCLUDED +#ifndef R_NANOARROW_VCTR_BUILDER_OTHER_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_OTHER_H_INCLUDED #define R_NO_REMAP #include From a5a9b6fa525621aac4df199000ec9cf60ed60d48 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 14 Mar 2024 21:51:59 -0300 Subject: [PATCH 32/36] always route extension types and dictionaries through the Other route --- r/src/vctr_builder.cc | 42 ++++++++++++++++++++++++---------------- r/src/vctr_builder_dbl.h | 8 +++++++- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index f9dae98e3..0e36c2962 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -157,24 +157,32 @@ static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema, ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp, VctrBuilderOptions options, VctrBuilder** out, ArrowError* error) { - // See if we can skip any ptype resolution at all + ArrowSchemaView view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, error)); + + // Extension types and dictionary types always need their ptype resolved in + // R and always need to use the VctrBuilderOther. This simplifies writing + // the builders (e.g., they do not all have to consider these cases).
+ if (view.extension_name.size_bytes > 0 || view.type == NANOARROW_TYPE_DICTIONARY) { + SEXP inferred_ptype_sexp = PROTECT(call_infer_ptype_other(schema)); + int code = InstantiateBuilderBase(schema, VECTOR_TYPE_OTHER, inferred_ptype_sexp, out, + error); + UNPROTECT(1); + return code; + } + if (ptype_sexp == R_NilValue) { - ArrowSchemaView view; - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, error)); - - // Ensure extension types always go through infer_ptype_other() - if (view.extension_name.size_bytes == 0) { - enum VectorType vector_type = nanoarrow_infer_vector_type(view.type); - switch (vector_type) { - case VECTOR_TYPE_LGL: - case VECTOR_TYPE_INT: - case VECTOR_TYPE_DBL: - case VECTOR_TYPE_CHR: - case VECTOR_TYPE_DATA_FRAME: - return InstantiateBuilderBase(schema, vector_type, R_NilValue, out, error); - default: - break; - } + // See if we can skip any ptype resolution at all + enum VectorType vector_type = nanoarrow_infer_vector_type(view.type); + switch (vector_type) { + case VECTOR_TYPE_LGL: + case VECTOR_TYPE_INT: + case VECTOR_TYPE_DBL: + case VECTOR_TYPE_CHR: + case VECTOR_TYPE_DATA_FRAME: + return InstantiateBuilderBase(schema, vector_type, R_NilValue, out, error); + default: + break; } // Otherwise, resolve the ptype and use it (this will error for ptypes that can't be diff --git a/r/src/vctr_builder_dbl.h b/r/src/vctr_builder_dbl.h index c31114775..9d1419ba2 100644 --- a/r/src/vctr_builder_dbl.h +++ b/r/src/vctr_builder_dbl.h @@ -32,7 +32,13 @@ class DblBuilder : public VctrBuilder { explicit DblBuilder(SEXP ptype_sexp, VectorType vector_type = VECTOR_TYPE_DBL) : VctrBuilder(vector_type, ptype_sexp) {} - SEXP GetPtype() override { return Rf_allocVector(REALSXP, 0); } + SEXP GetPtype() override { + if (ptype_sexp_ != R_NilValue) { + return ptype_sexp_; + } else { + return Rf_allocVector(REALSXP, 0); + } + } ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); From 
25b7b884f9456837236e9dba99ffe4998a3c4011 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 14 Mar 2024 23:22:19 -0300 Subject: [PATCH 33/36] sketch "other" support --- r/src/vctr_builder_other.h | 83 +++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/r/src/vctr_builder_other.h b/r/src/vctr_builder_other.h index e0d2c9871..74f69dac9 100644 --- a/r/src/vctr_builder_other.h +++ b/r/src/vctr_builder_other.h @@ -24,9 +24,90 @@ #include "vctr_builder_base.h" +// If we've ended up here, we need to call in to R to convert this stream +// of arrays into an R vector. Currently, the S3 generic that implements +// this is convert_array(), so we have to do this one array at a time. +// The current conversions that are implemented this way internally are +// factor(), decimal, and + extension types/dictionary. +// +// An early version of this reimplemented a good chunk of vctrs-like internals +// to allow a generic preallocate where each chunk would be copied in to the +// preallocated vector. This version just converts each chunk as it comes +// and calls c(); however, eventually the generic should be +// convert_array_stream() to give implementations in other packages the ability +// to handle converting more than one array at a time. 
class OtherBuilder : public VctrBuilder { public: - explicit OtherBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp) {} + explicit OtherBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp), + chunks_sexp_(R_NilValue), + chunks_tail_(R_NilValue) {} + + ~OtherBuilder() { nanoarrow_release_sexp(chunks_sexp_); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { return NANOARROW_OK; } + + ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + // Fill this in + return NANOARROW_OK; + } + + ArrowErrorCode Finish(ArrowError* error) override { + if (chunks_tail_ == chunks_sexp_) { + Rprintf("zero chunks\n"); + // Zero chunks (return the ptype) + // Probably need to ensure the ptype has zero elements + SetValue(GetPtype()); + + } else if (chunks_tail_ == CDR(chunks_sexp_)) { + Rprintf("one chunk\n"); + // One chunk (return the chunk) + SetValue(CAR(chunks_tail_)); + + } else { + Rprintf("many chunks\n"); + // Many chunks (concatenate or rbind) + SEXP fun; + if (Rf_inherits(ptype_sexp_, "data.frame")) { + fun = PROTECT(Rf_install("rbind")); + } else { + fun = PROTECT(Rf_install("c")); + } + + SETCAR(chunks_sexp_, fun); + UNPROTECT(1); + + SEXP result = PROTECT(Rf_eval(chunks_sexp_, R_BaseEnv)); + SetValue(result); + UNPROTECT(1); + } + + nanoarrow_release_sexp(chunks_sexp_); + chunks_sexp_ = R_NilValue; + chunks_tail_ = R_NilValue; + return NANOARROW_OK; + } + + private: + SEXP chunks_sexp_; + SEXP chunks_tail_; + + void Append(SEXP chunk_sexp) { + if (chunks_sexp_ == R_NilValue) { + // Not sure if we will need no function, c, or rbind when we + // create this, so leave it as R_NilValue for now. 
+ SEXP chunks_init = PROTECT(Rf_lang1(R_NilValue)); + chunks_sexp_ = chunks_init; + nanoarrow_preserve_sexp(chunks_sexp_); + chunks_tail_ = chunks_sexp_; + UNPROTECT(1); + } + + SEXP next_sexp = PROTECT(Rf_lcons(chunk_sexp, R_NilValue)); + SETCDR(chunks_tail_, next_sexp); + UNPROTECT(1); + chunks_tail_ = next_sexp; + } }; #endif From 873fd8e4fb89c0c969d5c1bdb46c866d071b8278 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 14 Mar 2024 23:29:12 -0300 Subject: [PATCH 34/36] pass optional ownership info --- r/src/vctr_builder.cc | 2 +- r/src/vctr_builder_base.h | 5 +++-- r/src/vctr_builder_blob.h | 5 +++-- r/src/vctr_builder_chr.h | 5 +++-- r/src/vctr_builder_dbl.h | 5 +++-- r/src/vctr_builder_difftime.h | 5 +++-- r/src/vctr_builder_int.h | 5 +++-- r/src/vctr_builder_int64.h | 5 +++-- r/src/vctr_builder_lgl.h | 5 +++-- r/src/vctr_builder_other.h | 3 ++- r/src/vctr_builder_posixct.h | 5 +++-- r/src/vctr_builder_unspecified.h | 3 ++- 12 files changed, 32 insertions(+), 21 deletions(-) diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc index 0e36c2962..52b1dfb21 100644 --- a/r/src/vctr_builder.cc +++ b/r/src/vctr_builder.cc @@ -311,7 +311,7 @@ SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp) { Rf_error("builder->Reserve() failed: %s", error.message); } - result = builder->PushNext(array, &error); + result = builder->PushNext(array_xptr, array, &error); if (result != NANOARROW_OK) { Rf_error("builder->PushNext() failed: %s", error.message); } diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h index 9bd64c27f..02597f5a4 100644 --- a/r/src/vctr_builder_base.h +++ b/r/src/vctr_builder_base.h @@ -76,7 +76,8 @@ struct VctrBuilder { // Push an array into this builder and do not take ownership of array. This is // called when the caller cannot safely relinquish ownership of an array (e.g., // convert_array()). Calling this method may longjmp. 
- virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) { + virtual ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(&array_view_, array, error)); return NANOARROW_OK; } @@ -85,7 +86,7 @@ struct VctrBuilder { // ownership. This is called when the caller can relinquish ownership (e.g., // convert_array_stream()). Calling this method may longjmp. virtual ArrowErrorCode PushNextOwning(ArrowArray* array, ArrowError* error) { - return PushNext(array, error); + return PushNext(R_NilValue, array, error); } // Perform any final calculations required to calculate the return value. diff --git a/r/src/vctr_builder_blob.h b/r/src/vctr_builder_blob.h index c6ba54260..a47edf180 100644 --- a/r/src/vctr_builder_blob.h +++ b/r/src/vctr_builder_blob.h @@ -36,8 +36,9 @@ class BlobBuilder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); R_xlen_t length = array_view_.length; switch (array_view_.storage_type) { diff --git a/r/src/vctr_builder_chr.h b/r/src/vctr_builder_chr.h index a25c88523..4b51069e4 100644 --- a/r/src/vctr_builder_chr.h +++ b/r/src/vctr_builder_chr.h @@ -41,8 +41,9 @@ class ChrBuilder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); R_xlen_t length = array_view_.length; switch (array_view_.storage_type) { diff --git 
a/r/src/vctr_builder_dbl.h b/r/src/vctr_builder_dbl.h index 9d1419ba2..7ada4195d 100644 --- a/r/src/vctr_builder_dbl.h +++ b/r/src/vctr_builder_dbl.h @@ -48,8 +48,9 @@ class DblBuilder : public VctrBuilder { return NANOARROW_OK; } - virtual ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + virtual ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); double* result = REAL(value_); int64_t n_bad_values = 0; diff --git a/r/src/vctr_builder_difftime.h b/r/src/vctr_builder_difftime.h index 6c6d00b42..aecdbdfe2 100644 --- a/r/src/vctr_builder_difftime.h +++ b/r/src/vctr_builder_difftime.h @@ -80,9 +80,10 @@ class DifftimeBuilder : public DblBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { R_xlen_t value_size0 = value_size_; - NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array, error)); + NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array_shelter, array, error)); if (scale_ != 1) { double* result = REAL(value_); diff --git a/r/src/vctr_builder_int.h b/r/src/vctr_builder_int.h index b37d9d6ba..4a7633771 100644 --- a/r/src/vctr_builder_int.h +++ b/r/src/vctr_builder_int.h @@ -38,8 +38,9 @@ class IntBuilder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); int* result = INTEGER(value_); int64_t n_bad_values = 0; diff --git a/r/src/vctr_builder_int64.h 
b/r/src/vctr_builder_int64.h index 211e70b76..e4d4a2859 100644 --- a/r/src/vctr_builder_int64.h +++ b/r/src/vctr_builder_int64.h @@ -39,8 +39,9 @@ class Integer64Builder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); int64_t* result = reinterpret_cast(REAL(value_)); int64_t n_bad_values = 0; diff --git a/r/src/vctr_builder_lgl.h b/r/src/vctr_builder_lgl.h index ff5dd981d..f87fbd673 100644 --- a/r/src/vctr_builder_lgl.h +++ b/r/src/vctr_builder_lgl.h @@ -38,8 +38,9 @@ class LglBuilder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { - NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array, error)); + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); // True for all the types supported here const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; diff --git a/r/src/vctr_builder_other.h b/r/src/vctr_builder_other.h index 74f69dac9..fc35f0053 100644 --- a/r/src/vctr_builder_other.h +++ b/r/src/vctr_builder_other.h @@ -47,7 +47,8 @@ class OtherBuilder : public VctrBuilder { ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { // Fill this in return NANOARROW_OK; } diff --git a/r/src/vctr_builder_posixct.h b/r/src/vctr_builder_posixct.h index ee0a38ccd..4ca5dff80 100644 --- a/r/src/vctr_builder_posixct.h +++ 
b/r/src/vctr_builder_posixct.h @@ -69,9 +69,10 @@ class PosixctBuilder : public DblBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { R_xlen_t value_size0 = value_size_; - NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array, error)); + NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array_shelter, array, error)); if (scale_ != 1) { double* result = REAL(value_); diff --git a/r/src/vctr_builder_unspecified.h b/r/src/vctr_builder_unspecified.h index 0556a7c8b..1d39b2d07 100644 --- a/r/src/vctr_builder_unspecified.h +++ b/r/src/vctr_builder_unspecified.h @@ -50,7 +50,8 @@ class UnspecifiedBuilder : public VctrBuilder { return NANOARROW_OK; } - ArrowErrorCode PushNext(const ArrowArray* array, ArrowError* error) override { + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { int64_t not_null_count; if (array->null_count == -1 && array->buffers[0] == nullptr) { not_null_count = array->length; From 16225e2d24e66d8ff4e0214f403460899d3ee5a6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 14 Mar 2024 23:43:33 -0300 Subject: [PATCH 35/36] prototype method --- r/src/vctr_builder_other.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/r/src/vctr_builder_other.h b/r/src/vctr_builder_other.h index fc35f0053..df2aef644 100644 --- a/r/src/vctr_builder_other.h +++ b/r/src/vctr_builder_other.h @@ -49,24 +49,35 @@ class OtherBuilder : public VctrBuilder { ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, ArrowError* error) override { - // Fill this in + SEXP schema_borrowed_xptr = PROTECT( + R_MakeExternalPtr(const_cast(schema_), R_NilValue, R_NilValue)); + Rf_setAttrib(schema_borrowed_xptr, R_ClassSymbol, nanoarrow_cls_schema); + + SEXP array_borrowed_xptr = PROTECT(R_MakeExternalPtr( + const_cast(array), 
schema_borrowed_xptr, array_shelter)); + Rf_setAttrib(array_borrowed_xptr, R_ClassSymbol, nanoarrow_cls_array); + + SEXP fun = PROTECT(Rf_install("convert_fallback_other")); + SEXP call = + PROTECT(Rf_lang5(fun, array_borrowed_xptr, R_NilValue, R_NilValue, ptype_sexp_)); + SEXP chunk_sexp = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); + Append(chunk_sexp); + UNPROTECT(5); + return NANOARROW_OK; } ArrowErrorCode Finish(ArrowError* error) override { if (chunks_tail_ == chunks_sexp_) { - Rprintf("zero chunks\n"); // Zero chunks (return the ptype) // Probably need to ensure the ptype has zero elements - SetValue(GetPtype()); + SetValue(ptype_sexp_); } else if (chunks_tail_ == CDR(chunks_sexp_)) { - Rprintf("one chunk\n"); // One chunk (return the chunk) SetValue(CAR(chunks_tail_)); } else { - Rprintf("many chunks\n"); // Many chunks (concatenate or rbind) SEXP fun; if (Rf_inherits(ptype_sexp_, "data.frame")) { From 15b38b6d51c88cb4b47bb609aee278726582f6ba Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 4 Jul 2024 04:43:46 -0300 Subject: [PATCH 36/36] fix init --- r/src/init.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/r/src/init.c b/r/src/init.c index b472cfbcb..1998b2357 100644 --- a/r/src/init.c +++ b/r/src/init.c @@ -92,12 +92,10 @@ extern SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP diction extern SEXP nanoarrow_c_preserved_count(void); extern SEXP nanoarrow_c_preserved_empty(void); extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj); +extern SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list); extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp); extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); -extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr, SEXP ptype_sexp); -extern SEXP nanoarrow_c_infer_ptype_using_builder(SEXP schema_xptr); -extern SEXP 
nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_version(void); extern SEXP nanoarrow_c_version_runtime(void); @@ -172,12 +170,10 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0}, {"nanoarrow_c_preserve_and_release_on_other_thread", (DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1}, + {"nanoarrow_c_convert_array2", (DL_FUNC)&nanoarrow_c_convert_array2, 2}, {"nanoarrow_c_vctr_chunk_offsets", (DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1}, {"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2}, {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, - {"nanoarrow_c_infer_ptype_using_builder", - (DL_FUNC)&nanoarrow_c_infer_ptype_using_builder, 1}, - {"nanoarrow_c_convert_array2", (DL_FUNC)&nanoarrow_c_convert_array2, 2}, {"nanoarrow_c_version", (DL_FUNC)&nanoarrow_c_version, 0}, {"nanoarrow_c_version_runtime", (DL_FUNC)&nanoarrow_c_version_runtime, 0}, {NULL, NULL, 0}};