diff --git a/r/src/buffer.c b/r/src/buffer.c index 55e522288..f38148a16 100644 --- a/r/src/buffer.c +++ b/r/src/buffer.c @@ -23,6 +23,7 @@ #include "buffer.h" #include "nanoarrow.h" +#include "preserve.h" void finalize_buffer_xptr(SEXP buffer_xptr) { struct ArrowBuffer* buffer = (struct ArrowBuffer*)R_ExternalPtrAddr(buffer_xptr); diff --git a/r/src/buffer.h b/r/src/buffer.h index 2dcc49eb8..9d21443cf 100644 --- a/r/src/buffer.h +++ b/r/src/buffer.h @@ -22,6 +22,7 @@ #include #include "nanoarrow.h" +#include "preserve.h" #include "util.h" void finalize_buffer_xptr(SEXP buffer_xptr); diff --git a/r/src/infer_ptype.c b/r/src/infer_ptype.c index 1f5f8e042..0ffacf0df 100644 --- a/r/src/infer_ptype.c +++ b/r/src/infer_ptype.c @@ -34,7 +34,7 @@ // that later warns for out-of-range values (e.g., int64 to double()); // however, a user can use the convert_array(x, ptype = something_safer()) // when this occurs. -enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { +static enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { switch (type) { case NANOARROW_TYPE_BOOL: return VECTOR_TYPE_LGL; @@ -69,7 +69,7 @@ enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) { } // The same as the above, but from a nanoarrow_schema() -enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { +static enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); struct ArrowSchemaView schema_view; @@ -89,67 +89,3 @@ enum VectorType nanoarrow_infer_vector_type_schema(SEXP schema_xptr) { enum VectorType nanoarrow_infer_vector_type_array(SEXP array_xptr) { return nanoarrow_infer_vector_type_schema(array_xptr_get_schema(array_xptr)); } - -// Call nanoarrow::infer_ptype_other(), which handles less common types that -// are easier to compute in R or gives an informative error if this is -// not possible. 
-static SEXP call_infer_ptype_other(SEXP schema_xptr) { - SEXP fun = PROTECT(Rf_install("infer_ptype_other")); - SEXP call = PROTECT(Rf_lang2(fun, schema_xptr)); - SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); - UNPROTECT(3); - return result; -} - -SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr); - -static SEXP infer_ptype_data_frame(SEXP schema_xptr) { - struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr); - SEXP result = PROTECT(Rf_allocVector(VECSXP, schema->n_children)); - SEXP result_names = PROTECT(Rf_allocVector(STRSXP, schema->n_children)); - - for (R_xlen_t i = 0; i < schema->n_children; i++) { - SEXP child_xptr = PROTECT(borrow_schema_child_xptr(schema_xptr, i)); - SET_VECTOR_ELT(result, i, nanoarrow_c_infer_ptype(child_xptr)); - UNPROTECT(1); - - struct ArrowSchema* child = schema->children[i]; - if (child->name != NULL) { - SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); - } else { - SET_STRING_ELT(result_names, i, Rf_mkChar("")); - } - } - - Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); - Rf_setAttrib(result, R_NamesSymbol, result_names); - SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); - INTEGER(rownames)[0] = NA_INTEGER; - INTEGER(rownames)[1] = 0; - Rf_setAttrib(result, R_RowNamesSymbol, rownames); - UNPROTECT(3); - return result; -} - -SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) { - enum VectorType vector_type = nanoarrow_infer_vector_type_schema(schema_xptr); - SEXP ptype = R_NilValue; - - switch (vector_type) { - case VECTOR_TYPE_LGL: - case VECTOR_TYPE_INT: - case VECTOR_TYPE_DBL: - case VECTOR_TYPE_CHR: - ptype = PROTECT(nanoarrow_alloc_type(vector_type, 0)); - break; - case VECTOR_TYPE_DATA_FRAME: - ptype = PROTECT(infer_ptype_data_frame(schema_xptr)); - break; - default: - ptype = PROTECT(call_infer_ptype_other(schema_xptr)); - break; - } - - UNPROTECT(1); - return ptype; -} diff --git a/r/src/init.c b/r/src/init.c index 69c943911..1998b2357 100644 --- a/r/src/init.c +++ 
b/r/src/init.c @@ -20,6 +20,7 @@ #include #include "altrep.h" +#include "preserve.h" #include "util.h" /* generated by tools/make-callentries.R */ @@ -91,6 +92,7 @@ extern SEXP nanoarrow_c_schema_set_dictionary(SEXP schema_mut_xptr, SEXP diction extern SEXP nanoarrow_c_preserved_count(void); extern SEXP nanoarrow_c_preserved_empty(void); extern SEXP nanoarrow_c_preserve_and_release_on_other_thread(SEXP obj); +extern SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp); extern SEXP nanoarrow_c_vctr_chunk_offsets(SEXP array_list); extern SEXP nanoarrow_c_vctr_chunk_resolve(SEXP indices_sexp, SEXP offsets_sexp); extern SEXP nanoarrow_c_vctr_as_slice(SEXP indices_sexp); @@ -168,6 +170,7 @@ static const R_CallMethodDef CallEntries[] = { {"nanoarrow_c_preserved_empty", (DL_FUNC)&nanoarrow_c_preserved_empty, 0}, {"nanoarrow_c_preserve_and_release_on_other_thread", (DL_FUNC)&nanoarrow_c_preserve_and_release_on_other_thread, 1}, + {"nanoarrow_c_convert_array2", (DL_FUNC)&nanoarrow_c_convert_array2, 2}, {"nanoarrow_c_vctr_chunk_offsets", (DL_FUNC)&nanoarrow_c_vctr_chunk_offsets, 1}, {"nanoarrow_c_vctr_chunk_resolve", (DL_FUNC)&nanoarrow_c_vctr_chunk_resolve, 2}, {"nanoarrow_c_vctr_as_slice", (DL_FUNC)&nanoarrow_c_vctr_as_slice, 1}, diff --git a/r/src/materialize.h b/r/src/materialize.h index c3b2c5cbd..08e1e27cb 100644 --- a/r/src/materialize.h +++ b/r/src/materialize.h @@ -23,6 +23,10 @@ #include "materialize_common.h" +#ifdef __cplusplus +extern "C" { +#endif + // A heuristic to identify prototypes that should be treated like data frames // (i.e., including record-style vectors like POSIXct). This heuristic returns // true if ptype is a data.frame or is an S3 list with names. @@ -46,4 +50,8 @@ SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len); // nanoarrow_vctr conversion. 
int nanoarrow_materialize_finalize_result(SEXP converter_xptr); +#ifdef __cplusplus +} +#endif + #endif diff --git a/r/src/materialize_common.h b/r/src/materialize_common.h index 6c811b615..5c9b62d33 100644 --- a/r/src/materialize_common.h +++ b/r/src/materialize_common.h @@ -31,12 +31,14 @@ enum VectorType { VECTOR_TYPE_UNINITIALIZED, VECTOR_TYPE_NULL, + VECTOR_TYPE_RAW, VECTOR_TYPE_UNSPECIFIED, VECTOR_TYPE_LGL, VECTOR_TYPE_INT, VECTOR_TYPE_DBL, VECTOR_TYPE_ALTREP_CHR, VECTOR_TYPE_CHR, + VECTOR_TYPE_HMS, VECTOR_TYPE_POSIXCT, VECTOR_TYPE_DATE, VECTOR_TYPE_DIFFTIME, diff --git a/r/src/pointers.c b/r/src/pointers.c index 110029b8c..44f83a075 100644 --- a/r/src/pointers.c +++ b/r/src/pointers.c @@ -21,11 +21,9 @@ #include "array.h" #include "array_stream.h" +#include "preserve.h" #include "schema.h" -// More reliable way to stringify intptr_t on Windows using C++ -void intptr_as_string(intptr_t ptr_int, char* buf); - SEXP nanoarrow_c_allocate_schema(void) { return nanoarrow_schema_owning_xptr(); } SEXP nanoarrow_c_allocate_array(void) { return nanoarrow_array_owning_xptr(); } diff --git a/r/src/nanoarrow_cpp.cc b/r/src/preserve.cc similarity index 91% rename from r/src/nanoarrow_cpp.cc rename to r/src/preserve.cc index 9c0e38d68..7e62faf3a 100644 --- a/r/src/nanoarrow_cpp.cc +++ b/r/src/preserve.cc @@ -26,6 +26,8 @@ #include #include +#include "preserve.h" + // Without this infrastructure, it's possible to check that all objects // are released by running devtools::test(); gc() in a fresh session and // making sure that nanoarrow:::preserved_count() is zero afterward. 
@@ -35,7 +37,7 @@ #include #endif -extern "C" void intptr_as_string(intptr_t ptr_int, char* buf) { +void intptr_as_string(intptr_t ptr_int, char* buf) { std::string ptr_str = std::to_string(ptr_int); memcpy(buf, ptr_str.data(), ptr_str.size()); } @@ -166,13 +168,13 @@ class PreservedSEXPRegistry { #endif }; -extern "C" void nanoarrow_preserve_init(void) { PreservedSEXPRegistry::GetInstance(); } +void nanoarrow_preserve_init(void) { PreservedSEXPRegistry::GetInstance(); } -extern "C" void nanoarrow_preserve_sexp(SEXP obj) { +void nanoarrow_preserve_sexp(SEXP obj) { PreservedSEXPRegistry::GetInstance().preserve(obj); } -extern "C" void nanoarrow_release_sexp(SEXP obj) { +void nanoarrow_release_sexp(SEXP obj) { try { PreservedSEXPRegistry::GetInstance().release(obj); } catch (std::exception& e) { @@ -180,11 +182,11 @@ extern "C" void nanoarrow_release_sexp(SEXP obj) { } } -extern "C" int64_t nanoarrow_preserved_count(void) { +int64_t nanoarrow_preserved_count(void) { return PreservedSEXPRegistry::GetInstance().size(); } -extern "C" int64_t nanoarrow_preserved_empty(void) { +int64_t nanoarrow_preserved_empty(void) { try { return PreservedSEXPRegistry::GetInstance().empty_trash(); } catch (std::exception& e) { @@ -192,11 +194,11 @@ extern "C" int64_t nanoarrow_preserved_empty(void) { } } -extern "C" int nanoarrow_is_main_thread(void) { +int nanoarrow_is_main_thread(void) { return PreservedSEXPRegistry::GetInstance().is_main_thread(); } -extern "C" void nanoarrow_preserve_and_release_on_other_thread(SEXP obj) { +void nanoarrow_preserve_and_release_on_other_thread(SEXP obj) { nanoarrow_preserve_sexp(obj); std::thread worker([obj] { nanoarrow_release_sexp(obj); }); worker.join(); diff --git a/r/src/preserve.h b/r/src/preserve.h new file mode 100644 index 000000000..aeaed11d7 --- /dev/null +++ b/r/src/preserve.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_PRESERVE_H_INCLUDED +#define R_NANOARROW_PRESERVE_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Not really related to preserve/release, but needs C++ +void intptr_as_string(intptr_t ptr_int, char* buf); + +// Internal abstractions for R_PreserveObject and R_ReleaseObject +// that provide an opportunity for debugging information about +// preserved object lifecycle and possible future optimizations. 
+// These implementations use C++ and live in preserve.cc +void nanoarrow_preserve_init(void); +void nanoarrow_preserve_sexp(SEXP obj); +void nanoarrow_release_sexp(SEXP obj); +int64_t nanoarrow_preserved_count(void); +int64_t nanoarrow_preserved_empty(void); +int nanoarrow_is_main_thread(void); + +// For testing +void nanoarrow_preserve_and_release_on_other_thread(SEXP obj); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/r/src/util.c b/r/src/util.c index 6d4035ba6..6ef6d1669 100644 --- a/r/src/util.c +++ b/r/src/util.c @@ -19,6 +19,7 @@ #include #include +#include "preserve.h" #include "util.h" SEXP nanoarrow_ns_pkg = NULL; diff --git a/r/src/util.h b/r/src/util.h index d652330ed..14a37333a 100644 --- a/r/src/util.h +++ b/r/src/util.h @@ -36,20 +36,6 @@ extern SEXP nanoarrow_ptype_raw; void nanoarrow_init_cached_sexps(void); -// Internal abstractions for R_PreserveObject and R_ReleaseObject -// that provide an opportunity for debugging information about -// preserved object lifecycle and possible future optimizations. -// These implementations use C++ and live in nanoarrow_cpp.cc -void nanoarrow_preserve_init(void); -void nanoarrow_preserve_sexp(SEXP obj); -void nanoarrow_release_sexp(SEXP obj); -int64_t nanoarrow_preserved_count(void); -int64_t nanoarrow_preserved_empty(void); -int nanoarrow_is_main_thread(void); - -// For testing -void nanoarrow_preserve_and_release_on_other_thread(SEXP obj); - // Checker for very small mallocs() static inline void check_trivial_alloc(const void* ptr, const char* ptr_type) { if (ptr == NULL) { diff --git a/r/src/vctr_builder.cc b/r/src/vctr_builder.cc new file mode 100644 index 000000000..52b1dfb21 --- /dev/null +++ b/r/src/vctr_builder.cc @@ -0,0 +1,326 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#define R_NO_REMAP +#include +#include + +#include +#include + +#include "array.h" +#include "materialize.h" +#include "nanoarrow.h" +#include "nanoarrow/r.h" + +#include "vctr_builder.h" +#include "vctr_builder_base.h" +#include "vctr_builder_blob.h" +#include "vctr_builder_chr.h" +#include "vctr_builder_date.h" +#include "vctr_builder_dbl.h" +#include "vctr_builder_difftime.h" +#include "vctr_builder_hms.h" +#include "vctr_builder_int.h" +#include "vctr_builder_int64.h" +#include "vctr_builder_lgl.h" +#include "vctr_builder_list_of.h" +#include "vctr_builder_other.h" +#include "vctr_builder_posixct.h" +#include "vctr_builder_rcrd.h" +#include "vctr_builder_unspecified.h" + +// These conversions are the default R-native type guesses for +// an array that don't require extra information from the ptype (e.g., +// factor with levels). Some of these guesses may result in a conversion +// that later warns for out-of-range values (e.g., int64 to double()); +// however, a user can use the convert_array(x, ptype = something_safer()) +// when this occurs. 
+enum VectorType nanoarrow_infer_vector_type(enum ArrowType type) {
+  switch (type) {
+    case NANOARROW_TYPE_BOOL:
+      return VECTOR_TYPE_LGL;
+
+    // Integer types whose whole range fits in a 32-bit signed integer
+    case NANOARROW_TYPE_INT8:
+    case NANOARROW_TYPE_UINT8:
+    case NANOARROW_TYPE_INT16:
+    case NANOARROW_TYPE_UINT16:
+    case NANOARROW_TYPE_INT32:
+      return VECTOR_TYPE_INT;
+
+    // Wider integers, floating point and decimal128 all map to double()
+    case NANOARROW_TYPE_UINT32:
+    case NANOARROW_TYPE_INT64:
+    case NANOARROW_TYPE_UINT64:
+    case NANOARROW_TYPE_FLOAT:
+    case NANOARROW_TYPE_DOUBLE:
+    case NANOARROW_TYPE_DECIMAL128:
+      return VECTOR_TYPE_DBL;
+
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_LARGE_STRING:
+      return VECTOR_TYPE_CHR;
+
+    case NANOARROW_TYPE_DENSE_UNION:
+    case NANOARROW_TYPE_SPARSE_UNION:
+    case NANOARROW_TYPE_STRUCT:
+      return VECTOR_TYPE_DATA_FRAME;
+
+    // Everything else needs its ptype resolved some other way
+    default:
+      return VECTOR_TYPE_OTHER;
+  }
+}
+
+// Call nanoarrow::infer_ptype_other(), which handles less common types that
+// are easier to compute in R or gives an informative error if this is
+// not possible.
+//
+// `schema` is only borrowed: it is wrapped in a temporary external pointer
+// with class nanoarrow_cls_schema and no finalizer, so it is never released
+// from here. Evaluating the R call may longjmp. The returned SEXP is
+// unprotected; the caller must PROTECT it before allocating again.
+static SEXP call_infer_ptype_other(const ArrowSchema* schema) {
+  // R_MakeExternalPtr() takes a non-const void*, hence the const_cast
+  SEXP schema_xptr = PROTECT(
+      R_MakeExternalPtr(const_cast<ArrowSchema*>(schema), R_NilValue, R_NilValue));
+  Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
+
+  SEXP fun = PROTECT(Rf_install("infer_ptype_other"));
+  SEXP call = PROTECT(Rf_lang2(fun, schema_xptr));
+  SEXP result = PROTECT(Rf_eval(call, nanoarrow_ns_pkg));
+  UNPROTECT(4);
+  return result;
+}
+
+// A base method for when we already have the VectorType and have already
+// resolved the ptype_sexp (if needed).
+static ArrowErrorCode InstantiateBuilderBase(const ArrowSchema* schema,
+                                             VectorType vector_type, SEXP ptype_sexp,
+                                             VctrBuilder** out, ArrowError* error) {
+  // Pick the concrete builder for vector_type; `schema` and `error` are not
+  // consulted here. An unrecognized vector_type raises an R error (longjmp)
+  // and leaves *out untouched.
+  VctrBuilder* builder = nullptr;
+
+  switch (vector_type) {
+    case VECTOR_TYPE_LGL:
+      builder = new LglBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_INT:
+      builder = new IntBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_DBL:
+      builder = new DblBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_CHR:
+      builder = new ChrBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_DATA_FRAME:
+      builder = new RcrdBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_LIST_OF:
+      builder = new ListOfBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_UNSPECIFIED:
+      builder = new UnspecifiedBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_BLOB:
+      builder = new BlobBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_DATE:
+      builder = new DateBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_HMS:
+      builder = new HmsBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_POSIXCT:
+      builder = new PosixctBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_DIFFTIME:
+      builder = new DifftimeBuilder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_INTEGER64:
+      builder = new Integer64Builder(ptype_sexp);
+      break;
+    case VECTOR_TYPE_OTHER:
+      builder = new OtherBuilder(ptype_sexp);
+      break;
+    default:
+      Rf_error("Unknown vector type id: %d", (int)vector_type);
+  }
+
+  *out = builder;
+  return NANOARROW_OK;
+}
+
+// A version of the above for when we don't know the VectorType yet and
+// for when we're not sure if we need to pop into R to infer a ptype.
+ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp,
+                                  VctrBuilderOptions options, VctrBuilder** out,
+                                  ArrowError* error) {
+  ArrowSchemaView view;
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, schema, error));
+
+  // Extension types and dictionary types always need their ptype resolved in
+  // R and always need to use the VctrBuilderOther. This simplifies writing
+  // the builders (e.g., they do not all have to consider these cases).
+  // NOTE(review): a caller-supplied ptype_sexp is not used on this path;
+  // confirm that is intended.
+  if (view.extension_name.size_bytes > 0 || view.type == NANOARROW_TYPE_DICTIONARY) {
+    SEXP inferred_ptype_sexp = PROTECT(call_infer_ptype_other(schema));
+    int code = InstantiateBuilderBase(schema, VECTOR_TYPE_OTHER, inferred_ptype_sexp, out,
+                                      error);
+    UNPROTECT(1);
+    return code;
+  }
+
+  if (ptype_sexp == R_NilValue) {
+    // See if we can skip any ptype resolution at all
+    enum VectorType vector_type = nanoarrow_infer_vector_type(view.type);
+    switch (vector_type) {
+      case VECTOR_TYPE_LGL:
+      case VECTOR_TYPE_INT:
+      case VECTOR_TYPE_DBL:
+      case VECTOR_TYPE_CHR:
+      case VECTOR_TYPE_DATA_FRAME:
+        return InstantiateBuilderBase(schema, vector_type, R_NilValue, out, error);
+      default:
+        break;
+    }
+
+    // Otherwise, resolve the ptype and use it (this will error for ptypes that can't be
+    // resolved)
+    SEXP inferred_ptype_sexp = PROTECT(call_infer_ptype_other(schema));
+
+    // Error if it returns null, since this would put us in an infinite loop
+    if (inferred_ptype_sexp == R_NilValue) {
+      ArrowErrorSet(error, "infer_nanoarrow_ptype() returned NULL");
+      // Balance the PROTECT above before leaving through the error path
+      UNPROTECT(1);
+      return EINVAL;
+    }
+
+    int code = InstantiateBuilder(schema, inferred_ptype_sexp, options, out, error);
+    UNPROTECT(1);
+    return code;
+  }
+
+  // Handle some S3 objects internally to avoid S3 dispatch (e.g., when looping over a
+  // data frame with a lot of columns)
+  enum VectorType vector_type = VECTOR_TYPE_OTHER;
+  if (Rf_isObject(ptype_sexp)) {
+    if (nanoarrow_ptype_is_data_frame(ptype_sexp)) {
+      vector_type = VECTOR_TYPE_DATA_FRAME;
+    } else if (Rf_inherits(ptype_sexp, "vctrs_unspecified")) {
+      vector_type = VECTOR_TYPE_UNSPECIFIED;
+    } else if (Rf_inherits(ptype_sexp, "blob")) {
+      vector_type = VECTOR_TYPE_BLOB;
+    } else if (Rf_inherits(ptype_sexp, "vctrs_list_of")) {
+      vector_type = VECTOR_TYPE_LIST_OF;
+    } else if (Rf_inherits(ptype_sexp, "Date")) {
+      vector_type = VECTOR_TYPE_DATE;
+    } else if (Rf_inherits(ptype_sexp, "hms")) {
+      vector_type = VECTOR_TYPE_HMS;
+    } else if (Rf_inherits(ptype_sexp, "POSIXct")) {
+      vector_type = VECTOR_TYPE_POSIXCT;
+    } else if (Rf_inherits(ptype_sexp, "difftime")) {
+      vector_type = VECTOR_TYPE_DIFFTIME;
+    } else if (Rf_inherits(ptype_sexp, "integer64")) {
+      vector_type = VECTOR_TYPE_INTEGER64;
+    }
+  } else {
+    // If we're here, these are non-S3 objects
+    switch (TYPEOF(ptype_sexp)) {
+      case RAWSXP:
+        // NOTE(review): InstantiateBuilderBase() has no VECTOR_TYPE_RAW case, so
+        // a raw() ptype currently ends in its "Unknown vector type id" error.
+        vector_type = VECTOR_TYPE_RAW;
+        break;
+      case LGLSXP:
+        vector_type = VECTOR_TYPE_LGL;
+        break;
+      case INTSXP:
+        vector_type = VECTOR_TYPE_INT;
+        break;
+      case REALSXP:
+        vector_type = VECTOR_TYPE_DBL;
+        break;
+      case STRSXP:
+        vector_type = VECTOR_TYPE_CHR;
+        break;
+    }
+  }
+
+  return InstantiateBuilderBase(schema, vector_type, ptype_sexp, out, error);
+}
+
+// C API so that we can reuse these implementations elsewhere
+
+// Finalizer for the external pointer created by nanoarrow_vctr_builder_init().
+// The address is null when instantiation failed before a builder was stored.
+static void finalize_vctr_builder_xptr(SEXP vctr_builder_xptr) {
+  auto ptr = reinterpret_cast<VctrBuilder*>(R_ExternalPtrAddr(vctr_builder_xptr));
+  // delete on a null pointer is a no-op
+  delete ptr;
+}
+
+// Create and Init() a VctrBuilder for schema_xptr and (optionally) ptype_sexp.
+// Returns an external pointer that owns the builder (deleted by its finalizer)
+// and keeps schema_xptr alive through its protected slot. Raises an R error if
+// instantiation or initialization fails.
+SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp) {
+  struct ArrowSchema* schema = nanoarrow_schema_from_xptr(schema_xptr);
+  ArrowError error;
+  ArrowErrorInit(&error);
+
+  // For now, no configurable options
+  VctrBuilderOptions options;
+  options.use_altrep = VCTR_BUILDER_USE_ALTREP_DEFAULT;
+
+  // Wrap in an external pointer first so that the finalizer cleans up the
+  // builder if anything below raises an R error
+  SEXP vctr_builder_xptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, schema_xptr));
+  R_RegisterCFinalizer(vctr_builder_xptr, &finalize_vctr_builder_xptr);
+
+  // Instantiate the builder
+  VctrBuilder* vctr_builder = nullptr;
+  int code = InstantiateBuilder(schema, ptype_sexp, options, &vctr_builder, &error);
+  if (code != NANOARROW_OK) {
+    Rf_error("Failed to instantiate VctrBuilder: %s", error.message);
+  }
+
+  R_SetExternalPtrAddr(vctr_builder_xptr, vctr_builder);
+
+  // Initialize
+  code = vctr_builder->Init(schema, options, &error);
+  if (code != NANOARROW_OK) {
+    Rf_error("Failed to initialize VctrBuilder: %s", error.message);
+  }
+
+  UNPROTECT(1);
+  return vctr_builder_xptr;
+}
+
+// Infer the default ptype for a schema by instantiating a builder with no
+// ptype and asking it for one.
+SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr) {
+  SEXP vctr_builder_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, R_NilValue));
+  auto vctr_builder =
+      reinterpret_cast<VctrBuilder*>(R_ExternalPtrAddr(vctr_builder_xptr));
+  SEXP ptype_sexp = PROTECT(vctr_builder->GetPtype());
+  UNPROTECT(2);
+  return ptype_sexp;
+}
+
+// Convert a single array to an R vector using the builder selected for
+// ptype_sexp (or the inferred ptype when ptype_sexp is NULL). Raises an R
+// error if any builder step fails.
+SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp) {
+  ArrowArray* array = nanoarrow_array_from_xptr(array_xptr);
+  SEXP schema_xptr = PROTECT(array_xptr_get_schema(array_xptr));
+  SEXP builder_xptr = PROTECT(nanoarrow_vctr_builder_init(schema_xptr, ptype_sexp));
+  auto builder = reinterpret_cast<VctrBuilder*>(R_ExternalPtrAddr(builder_xptr));
+
+  ArrowError error;
+  ArrowErrorInit(&error);
+
+  int result = builder->Reserve(array->length, &error);
+  if (result != NANOARROW_OK) {
+    Rf_error("builder->Reserve() failed: %s", error.message);
+  }
+
+  result = builder->PushNext(array_xptr, array, &error);
+  if (result != NANOARROW_OK) {
+    Rf_error("builder->PushNext() failed: %s", error.message);
+  }
+
+  result = builder->Finish(&error);
+  if (result != NANOARROW_OK) {
+    Rf_error("builder->Finish() failed: %s", error.message);
+  }
+
+  // Fetch the value while builder_xptr is still protected: the builder is
+  // deleted by that pointer's finalizer, so it must stay reachable for the
+  // whole GetValue() call.
+  SEXP value = PROTECT(builder->GetValue());
+  UNPROTECT(3);
+  return value;
+}
diff --git a/r/src/vctr_builder.h b/r/src/vctr_builder.h
new file mode 100644
index 000000000..91e16f1c4
--- /dev/null
+++ b/r/src/vctr_builder.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_NANOARROW_VCTR_BUILDER_H_INCLUDED
+#define R_NANOARROW_VCTR_BUILDER_H_INCLUDED
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// An opaque pointer to a C++-implemented collector that is instantiated
+// using an ArrowSchema and (optionally) a SEXP ptype and is supplied
+// zero or more ArrowArrays before computing the output.
+struct VctrBuilder;
+
+// Options for when to use ALTREP. Currently ALTREP is only implemented
+// for character() with exactly one input chunk. The default may eventually
+// use some heuristics to decide if there is a likely performance advantage
+// to deferring the conversion.
+enum VctrBuilderUseAltrep {
+  VCTR_BUILDER_USE_ALTREP_DEFAULT = 0,
+  VCTR_BUILDER_USE_ALTREP_ALWAYS = 1,
+  VCTR_BUILDER_USE_ALTREP_NEVER = 2
+};
+
+// Options controlling the details of how arrays are built. Note that
+// this does not control the destination ptype: customizing ptype resolution
+// is currently possible by passing a function to the `to` argument at the
+// top level. Future additions could control the error/warning strategy
+// for (potentially) lossy conversions.
+struct VctrBuilderOptions {
+  enum VctrBuilderUseAltrep use_altrep;
+};
+
+SEXP nanoarrow_vctr_builder_init(SEXP schema_xptr, SEXP ptype_sexp);
+
+SEXP nanoarrow_c_infer_ptype(SEXP schema_xptr);
+
+SEXP nanoarrow_c_convert_array2(SEXP array_xptr, SEXP ptype_sexp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/r/src/vctr_builder_base.h b/r/src/vctr_builder_base.h
new file mode 100644
index 000000000..02597f5a4
--- /dev/null
+++ b/r/src/vctr_builder_base.h
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef R_NANOARROW_VCTR_BUILDER_BASE_H_INCLUDED
+#define R_NANOARROW_VCTR_BUILDER_BASE_H_INCLUDED
+
+#define R_NO_REMAP
+#include <R.h>
+#include <Rinternals.h>
+
+#include "materialize.h"
+#include "nanoarrow.h"
+#include "preserve.h"
+#include "vctr_builder.h"
+
+struct VctrBuilder {
+ public:
+  // VctrBuilder instances are always created from a vector_type or a ptype.
+  // InstantiateBuilder() takes care of picking which subclass. The base class
+  // constructor takes these two arguments to provide consumer implementations
+  // for inspecting their value. This does not validate any ptypes (that would
+  // happen in Init() if needed).
+  VctrBuilder(VectorType vector_type, SEXP ptype_sexp)
+      : schema_(nullptr),
+        vector_type_(vector_type),
+        ptype_sexp_(R_NilValue),
+        value_(R_NilValue),
+        value_size_(0) {
+    ArrowArrayViewInitFromType(&array_view_, NANOARROW_TYPE_UNINITIALIZED);
+    // Preserve before storing so the destructor's release is always paired
+    nanoarrow_preserve_sexp(ptype_sexp);
+    ptype_sexp_ = ptype_sexp;
+  }
+
+  // Each instance pairs exactly one preserve with one release for ptype_sexp_
+  // and value_; a copy would release both a second time.
+  VctrBuilder(const VctrBuilder&) = delete;
+  VctrBuilder& operator=(const VctrBuilder&) = delete;
+
+  // Enable generic containers like std::unique_ptr
+  virtual ~VctrBuilder() {
+    nanoarrow_release_sexp(ptype_sexp_);
+    nanoarrow_release_sexp(value_);
+    ArrowArrayViewReset(&array_view_);
+  }
+
+  // Initialize this instance with the information available to the resolver, or the
+  // information that was inferred. If using the default `to`, ptype may be R_NilValue
+  // with Options containing the inferred information. Calling this method may longjmp.
+  // The implementation on the base class initializes the built-in ArrowArrayView and
+  // saves a reference to `schema` (but subclass implementations need not call it).
+  virtual ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options,
+                              ArrowError* error) {
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view_, schema, error));
+    NANOARROW_RETURN_NOT_OK(ArrowArrayViewInitFromSchema(&array_view_, schema, error));
+    schema_ = schema;
+    return NANOARROW_OK;
+  }
+
+  // Reserve space for n elements. The base implementation allocates nothing;
+  // it only rejects a second reservation once a value exists, returning
+  // ENOTSUP with a message in `error`, because reallocation is not implemented.
+  virtual ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) {
+    if (value_ != R_NilValue) {
+      ArrowErrorSet(error, "VctrBuilder reallocation is not implemented");
+      return ENOTSUP;
+    }
+
+    return NANOARROW_OK;
+  }
+
+  // Push an array into this builder and do not take ownership of array. This is
+  // called when the caller cannot safely relinquish ownership of an array (e.g.,
+  // convert_array()). Calling this method may longjmp. The base implementation
+  // only points the built-in ArrowArrayView at `array`.
+  virtual ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array,
+                                  ArrowError* error) {
+    NANOARROW_RETURN_NOT_OK(ArrowArrayViewSetArray(&array_view_, array, error));
+    return NANOARROW_OK;
+  }
+
+  // Push an array into this builder. The implementation may (but is not required
+  // to) take ownership. This is called when the caller can relinquish ownership
+  // (e.g., convert_array_stream()). Calling this method may longjmp.
+  virtual ArrowErrorCode PushNextOwning(ArrowArray* array, ArrowError* error) {
+    return PushNext(R_NilValue, array, error);
+  }
+
+  // Perform any final calculations required to calculate the return value.
+  // Calling this method may longjmp. The base implementation copies most
+  // attributes from the ptype onto the value when both are set.
+  virtual ArrowErrorCode Finish(ArrowError* error) {
+    if (ptype_sexp_ != R_NilValue && value_ != R_NilValue) {
+      Rf_copyMostAttrib(ptype_sexp_, value_);
+    }
+
+    return NANOARROW_OK;
+  }
+
+  // Release the final value of the builder. Calling this method may longjmp.
+  // The builder gives up its reference, so the returned SEXP is unprotected
+  // and the caller must PROTECT it before allocating again.
+  virtual SEXP GetValue() {
+    SEXP value = PROTECT(value_);
+    nanoarrow_release_sexp(value_);
+    value_ = R_NilValue;
+    UNPROTECT(1);
+    return value;
+  }
+
+  // Get (or allocate if required) the SEXP ptype for this output
+  virtual SEXP GetPtype() { return ptype_sexp_; }
+
+ protected:
+  ArrowSchemaView schema_view_;  // populated by Init()
+  ArrowArrayView array_view_;    // re-pointed at each array by PushNext()
+  const ArrowSchema* schema_;    // borrowed; set by Init()
+  VectorType vector_type_;
+  SEXP ptype_sexp_;      // preserved for the lifetime of the builder
+  SEXP value_;           // preserved while held; R_NilValue when unset
+  R_xlen_t value_size_;  // offset used by subclasses when writing into value_
+
+  // Replace the held value. The new value is preserved before the old one is
+  // released so that SetValue(value_) cannot drop the only reference.
+  void SetValue(SEXP value) {
+    nanoarrow_preserve_sexp(value);
+    nanoarrow_release_sexp(value_);
+    value_ = value;
+  }
+
+  // Call warn_lossy_conversion(count, msg) in the nanoarrow namespace.
+  // Evaluating the R call may longjmp.
+  ArrowErrorCode WarnLossyConvert(const char* msg, int64_t count) {
+    SEXP fun = PROTECT(Rf_install("warn_lossy_conversion"));
+    SEXP count_sexp = PROTECT(Rf_ScalarReal((double)count));
+    SEXP msg_sexp = PROTECT(Rf_mkString(msg));
+    SEXP call = PROTECT(Rf_lang3(fun, count_sexp, msg_sexp));
+    Rf_eval(call, nanoarrow_ns_pkg);
+    UNPROTECT(4);
+    return NANOARROW_OK;
+  }
+
+  // Call stop_cant_convert_schema(schema, ptype) in the nanoarrow namespace.
+  // The schema is wrapped in a temporary external pointer with no finalizer,
+  // so it is only borrowed.
+  void StopCantConvert() {
+    SEXP fun = PROTECT(Rf_install("stop_cant_convert_schema"));
+    SEXP schema_xptr = PROTECT(
+        R_MakeExternalPtr(const_cast<ArrowSchema*>(schema_), R_NilValue, R_NilValue));
+    Rf_setAttrib(schema_xptr, R_ClassSymbol, nanoarrow_cls_schema);
+    SEXP ptype_sexp = PROTECT(GetPtype());
+
+    SEXP call = PROTECT(Rf_lang3(fun, schema_xptr, ptype_sexp));
+    Rf_eval(call, nanoarrow_ns_pkg);
+    UNPROTECT(4);
+  }
+};
+
+// Resolve a builder class from a schema and (optional) ptype and instantiate it
+ArrowErrorCode InstantiateBuilder(const ArrowSchema* schema, SEXP ptype_sexp,
+                                  VctrBuilderOptions options, VctrBuilder** out,
+                                  ArrowError* error);
+
+#endif
diff --git a/r/src/vctr_builder_blob.h b/r/src/vctr_builder_blob.h
new file mode 100644
index 000000000..a47edf180
--- /dev/null
+++ b/r/src/vctr_builder_blob.h
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#ifndef R_NANOARROW_VCTR_BUILDER_BLOB_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_BLOB_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class BlobBuilder : public VctrBuilder { + public: + explicit BlobBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_BLOB, ptype_sexp) {} + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(VECSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); + R_xlen_t length = array_view_.length; + + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + // Works because lists are filled with R_NilValue by default + // when allocated. + return NANOARROW_OK; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + break; + default: + return ENOTSUP; + } + + struct ArrowBufferView item; + SEXP item_sexp; + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowArrayViewIsNull(&array_view_, i)) { + item = ArrowArrayViewGetBytesUnsafe(&array_view_, i); + item_sexp = PROTECT(Rf_allocVector(RAWSXP, item.size_bytes)); + memcpy(RAW(item_sexp), item.data.data, item.size_bytes); + SET_VECTOR_ELT(value_, value_size_ + i, item_sexp); + UNPROTECT(1); + } + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_chr.h b/r/src/vctr_builder_chr.h new file mode 100644 index 000000000..4b51069e4 --- /dev/null +++ b/r/src/vctr_builder_chr.h @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_CHR_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_CHR_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include +#include + +#include "vctr_builder_base.h" + +class ChrBuilder : public VctrBuilder { + public: + explicit ChrBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_CHR, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(STRSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(STRSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); + R_xlen_t length = array_view_.length; + + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + SET_STRING_ELT(value_, value_size_ + i, NA_STRING); + } + return NANOARROW_OK; + + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: { + char buf[64]; + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowArrayViewIsNull(&array_view_, i)) { + SET_STRING_ELT(value_, value_size_ + 
i, NA_STRING); + } else { + int n_chars = snprintf(buf, sizeof(buf), "%" PRId64, + ArrowArrayViewGetIntUnsafe(&array_view_, i)); + SET_STRING_ELT(value_, value_size_ + i, + Rf_mkCharLenCE(buf, n_chars, CE_UTF8)); + } + } + return NANOARROW_OK; + } + + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: { + struct ArrowStringView item; + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowArrayViewIsNull(&array_view_, i)) { + SET_STRING_ELT(value_, value_size_ + i, NA_STRING); + } else { + item = ArrowArrayViewGetStringUnsafe(&array_view_, i); + SET_STRING_ELT(value_, value_size_ + i, + Rf_mkCharLenCE(item.data, (int)item.size_bytes, CE_UTF8)); + } + } + + return NANOARROW_OK; + } + + default: + return ENOTSUP; + } + } +}; + +#endif diff --git a/r/src/vctr_builder_date.h b/r/src/vctr_builder_date.h new file mode 100644 index 000000000..65ebf265f --- /dev/null +++ b/r/src/vctr_builder_date.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_DATE_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_DATE_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_dbl.h" + +class DateBuilder : public DblBuilder { + public: + explicit DateBuilder(SEXP ptype_sexp) : DblBuilder(ptype_sexp, VECTOR_TYPE_DATE) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(DblBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_DATE32: + break; + default: + StopCantConvert(); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_dbl.h b/r/src/vctr_builder_dbl.h new file mode 100644 index 000000000..7ada4195d --- /dev/null +++ b/r/src/vctr_builder_dbl.h @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_DBL_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_DBL_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +// bit64::as.integer64(2^53) +#define MAX_DBL_AS_INTEGER 9007199254740992 + +class DblBuilder : public VctrBuilder { + public: + explicit DblBuilder(SEXP ptype_sexp, VectorType vector_type = VECTOR_TYPE_DBL) + : VctrBuilder(vector_type, ptype_sexp) {} + + SEXP GetPtype() override { + if (ptype_sexp_ != R_NilValue) { + return ptype_sexp_; + } else { + return Rf_allocVector(REALSXP, 0); + } + } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(REALSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + virtual ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); + + double* result = REAL(value_); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array_view_.length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_REAL; + } + break; + case NANOARROW_TYPE_DOUBLE: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_double + raw_src_offset, + length * sizeof(double)); + + // Set any nulls to NA_REAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_REAL; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case 
NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_FLOAT: + // No need to bounds check these types + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = ArrowArrayViewGetDoubleUnsafe(&array_view_, i); + } + + // Set any nulls to NA_REAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_REAL; + } + } + } + break; + + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + for (R_xlen_t i = 0; i < length; i++) { + double value = ArrowArrayViewGetDoubleUnsafe(&array_view_, i); + if (value > MAX_DBL_AS_INTEGER || value < -MAX_DBL_AS_INTEGER) { + // Content of null slot is undefined + n_bad_values += is_valid == NULL || ArrowBitGet(is_valid, raw_src_offset + i); + } + + result[value_size_ + i] = value; + } + + // Set any nulls to NA_REAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_REAL; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + WarnLossyConvert("may have incurred loss of precision in conversion to double()", + n_bad_values); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_difftime.h b/r/src/vctr_builder_difftime.h new file mode 100644 index 000000000..aecdbdfe2 --- /dev/null +++ b/r/src/vctr_builder_difftime.h @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_DIFFTIME_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_DIFFTIME_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_dbl.h" + +class DifftimeBuilder : public DblBuilder { + public: + explicit DifftimeBuilder(SEXP ptype_sexp, VectorType vector_type = VECTOR_TYPE_DIFFTIME) + : DblBuilder(ptype_sexp, vector_type), scale_(0) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(DblBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + case NANOARROW_TYPE_DURATION: + break; + default: + StopCantConvert(); + } + + switch (GetTimeUnits(ptype_sexp_)) { + case R_TIME_UNIT_MINUTES: + scale_ = 1.0 / 60; + break; + case R_TIME_UNIT_HOURS: + scale_ = 1.0 / (60 * 60); + break; + case R_TIME_UNIT_DAYS: + scale_ = 1.0 / (60 * 60 * 24); + break; + case R_TIME_UNIT_WEEKS: + scale_ = 1.0 / (60 * 60 * 24 * 7); + break; + default: + scale_ = 1.0; + break; + } + + switch (schema_view_.time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + scale_ *= 1; + break; + case NANOARROW_TIME_UNIT_MILLI: + scale_ *= 1e-3; + break; + case NANOARROW_TIME_UNIT_MICRO: + scale_ *= 1e-6; + break; + case NANOARROW_TIME_UNIT_NANO: + scale_ *= 1e-9; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + R_xlen_t value_size0 = 
value_size_; + NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array_shelter, array, error)); + + if (scale_ != 1) { + double* result = REAL(value_); + for (int64_t i = 0; i < array_view_.length; i++) { + result[value_size0 + i] = result[value_size0 + i] * scale_; + } + } + + return NANOARROW_OK; + } + + private: + double scale_; + + static RTimeUnits GetTimeUnits(SEXP ptype) { + SEXP units_attr = Rf_getAttrib(ptype, Rf_install("units")); + if (units_attr == R_NilValue || TYPEOF(units_attr) != STRSXP || + Rf_length(units_attr) != 1) { + Rf_error("Expected difftime 'units' attribute of type character(1)"); + } + + const char* dst_units = Rf_translateCharUTF8(STRING_ELT(units_attr, 0)); + if (strcmp(dst_units, "secs") == 0) { + return R_TIME_UNIT_SECONDS; + } else if (strcmp(dst_units, "mins") == 0) { + return R_TIME_UNIT_MINUTES; + } else if (strcmp(dst_units, "hours") == 0) { + return R_TIME_UNIT_HOURS; + } else if (strcmp(dst_units, "days") == 0) { + return R_TIME_UNIT_DAYS; + } else if (strcmp(dst_units, "weeks") == 0) { + return R_TIME_UNIT_WEEKS; + } else { + Rf_error("Unexpected value for difftime 'units' attribute"); + return R_TIME_UNIT_SECONDS; + } + } +}; + +#endif diff --git a/r/src/vctr_builder_hms.h b/r/src/vctr_builder_hms.h new file mode 100644 index 000000000..bd2052f31 --- /dev/null +++ b/r/src/vctr_builder_hms.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_HMS_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_HMS_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_difftime.h" + +class HmsBuilder : public DifftimeBuilder { + public: + explicit HmsBuilder(SEXP ptype_sexp) : DifftimeBuilder(ptype_sexp, VECTOR_TYPE_HMS) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) { + NANOARROW_RETURN_NOT_OK(DifftimeBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + case NANOARROW_TYPE_TIME32: + case NANOARROW_TYPE_TIME64: + break; + default: + StopCantConvert(); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_int.h b/r/src/vctr_builder_int.h new file mode 100644 index 000000000..4a7633771 --- /dev/null +++ b/r/src/vctr_builder_int.h @@ -0,0 +1,151 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_INT_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_INT_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class IntBuilder : public VctrBuilder { + public: + explicit IntBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_INT, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(INTSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(INTSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); + + int* result = INTEGER(value_); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array->length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_INTEGER; + } + break; + case NANOARROW_TYPE_INT32: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_int32 + raw_src_offset, + length * sizeof(int32_t)); + + // Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, 
raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + ArrowBitsUnpackInt32(array_view_.buffer_views[1].data.as_uint8 + raw_src_offset, + raw_src_offset, length, result + value_size_); + + // Set any nulls to NA_LOGICAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + // No need to bounds check for these types + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = (int32_t)ArrowArrayViewGetIntUnsafe(&array_view_, i); + } + + // Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER; + } + } + } + break; + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + // Loop + bounds check. Because we don't know what memory might be + // in a null slot, we have to check nulls if there are any. 
+ if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowBitGet(is_valid, raw_src_offset + i)) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT_MAX || value <= NA_INTEGER) { + result[value_size_ + i] = NA_INTEGER; + n_bad_values++; + } else { + result[value_size_ + i] = (int32_t)value; + } + } else { + result[value_size_ + i] = NA_INTEGER; + } + } + } else { + for (R_xlen_t i = 0; i < length; i++) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT_MAX || value <= NA_INTEGER) { + result[value_size_ + i] = NA_INTEGER; + n_bad_values++; + } else { + result[value_size_ + i] = (int32_t)value; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + WarnLossyConvert("outside integer range set to NA", n_bad_values); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_int64.h b/r/src/vctr_builder_int64.h new file mode 100644 index 000000000..e4d4a2859 --- /dev/null +++ b/r/src/vctr_builder_int64.h @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_INT64_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_INT64_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +#define NA_INTEGER64 INT64_MIN + +class Integer64Builder : public VctrBuilder { + public: + explicit Integer64Builder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_INTEGER64, ptype_sexp) {} + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(REALSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); + + int64_t* result = reinterpret_cast(REAL(value_)); + int64_t n_bad_values = 0; + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array->length; + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_INTEGER64; + } + break; + case NANOARROW_TYPE_INT64: + memcpy(result + value_size_, + array_view_.buffer_views[1].data.as_int32 + raw_src_offset, + length * sizeof(int64_t)); + + // Set any nulls to NA_INTEGER64 + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER64; + } + } + } + break; + case NANOARROW_TYPE_BOOL: + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + // No need to bounds check for these types + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = 
ArrowArrayViewGetIntUnsafe(&array_view_, i); + } + + // Set any nulls to NA_INTEGER + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_INTEGER64; + } + } + } + break; + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + // Loop + bounds check. Because we don't know what memory might be + // in a null slot, we have to check nulls if there are any. + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (ArrowBitGet(is_valid, raw_src_offset + i)) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT64_MAX || value <= NA_INTEGER64) { + result[value_size_ + i] = NA_INTEGER64; + n_bad_values++; + } else { + result[value_size_ + i] = value; + } + } else { + result[value_size_ + i] = NA_INTEGER64; + } + } + } else { + for (R_xlen_t i = 0; i < length; i++) { + int64_t value = ArrowArrayViewGetIntUnsafe(&array_view_, i); + if (value > INT64_MAX || value <= NA_INTEGER64) { + result[value_size_ + i] = NA_INTEGER64; + n_bad_values++; + } else { + result[value_size_ + i] = value; + } + } + } + break; + + default: + return EINVAL; + } + + if (n_bad_values > 0) { + WarnLossyConvert("outside integer64 range set to NA", n_bad_values); + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_lgl.h b/r/src/vctr_builder_lgl.h new file mode 100644 index 000000000..f87fbd673 --- /dev/null +++ b/r/src/vctr_builder_lgl.h @@ -0,0 +1,104 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_LGL_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_LGL_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class LglBuilder : public VctrBuilder { + public: + explicit LglBuilder(SEXP ptype_sexp) : VctrBuilder(VECTOR_TYPE_LGL, ptype_sexp) {} + + SEXP GetPtype() override { return Rf_allocVector(LGLSXP, 0); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(LGLSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::PushNext(array_shelter, array, error)); + + // True for all the types supported here + const uint8_t* is_valid = array_view_.buffer_views[0].data.as_uint8; + const uint8_t* data_buffer = array_view_.buffer_views[1].data.as_uint8; + + int64_t raw_src_offset = array_view_.offset; + R_xlen_t length = array->length; + int* result = LOGICAL(value_); + + // Fill the buffer + switch (array_view_.storage_type) { + case NANOARROW_TYPE_NA: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = NA_LOGICAL; + } + break; + case NANOARROW_TYPE_BOOL: + ArrowBitsUnpackInt32(data_buffer, raw_src_offset, length, result + value_size_); + + // Set any nulls to NA_LOGICAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, 
raw_src_offset + i)) { + result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: + case NANOARROW_TYPE_FLOAT: + case NANOARROW_TYPE_DOUBLE: + for (R_xlen_t i = 0; i < length; i++) { + result[value_size_ + i] = ArrowArrayViewGetIntUnsafe(&array_view_, i) != 0; + } + + // Set any nulls to NA_LOGICAL + if (is_valid != NULL && array_view_.null_count != 0) { + for (R_xlen_t i = 0; i < length; i++) { + if (!ArrowBitGet(is_valid, raw_src_offset + i)) { + result[value_size_ + i] = NA_LOGICAL; + } + } + } + break; + + default: + return EINVAL; + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/src/vctr_builder_list_of.h b/r/src/vctr_builder_list_of.h new file mode 100644 index 000000000..e100fa469 --- /dev/null +++ b/r/src/vctr_builder_list_of.h @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// Builder registered for the VECTOR_TYPE_LIST_OF vector type. It only
// forwards the ptype to VctrBuilder and overrides none of the base class
// methods, so all behaviour is whatever the base class provides.
class ListOfBuilder : public VctrBuilder {
 public:
  // ptype_sexp: the requested ptype, passed through to the base class
  explicit ListOfBuilder(SEXP ptype_sexp)
      : VctrBuilder(VECTOR_TYPE_LIST_OF, ptype_sexp) {}
};
+// +// An early version of this reimplemented a good chunk of vctrs-like internals +// to allow a generic preallocate where each chunk would be copied in to the +// preallocated vector. This version just converts each chunk as it comes +// and calls c(); however, eventually the generic should be +// convert_array_stream() to give implementations in other packages the ability +// to handle converting more than one array at a time. +class OtherBuilder : public VctrBuilder { + public: + explicit OtherBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_OTHER, ptype_sexp), + chunks_sexp_(R_NilValue), + chunks_tail_(R_NilValue) {} + + ~OtherBuilder() { nanoarrow_release_sexp(chunks_sexp_); } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { return NANOARROW_OK; } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + SEXP schema_borrowed_xptr = PROTECT( + R_MakeExternalPtr(const_cast(schema_), R_NilValue, R_NilValue)); + Rf_setAttrib(schema_borrowed_xptr, R_ClassSymbol, nanoarrow_cls_schema); + + SEXP array_borrowed_xptr = PROTECT(R_MakeExternalPtr( + const_cast(array), schema_borrowed_xptr, array_shelter)); + Rf_setAttrib(array_borrowed_xptr, R_ClassSymbol, nanoarrow_cls_array); + + SEXP fun = PROTECT(Rf_install("convert_fallback_other")); + SEXP call = + PROTECT(Rf_lang5(fun, array_borrowed_xptr, R_NilValue, R_NilValue, ptype_sexp_)); + SEXP chunk_sexp = PROTECT(Rf_eval(call, nanoarrow_ns_pkg)); + Append(chunk_sexp); + UNPROTECT(5); + + return NANOARROW_OK; + } + + ArrowErrorCode Finish(ArrowError* error) override { + if (chunks_tail_ == chunks_sexp_) { + // Zero chunks (return the ptype) + // Probably need to ensure the ptype has zero elements + SetValue(ptype_sexp_); + + } else if (chunks_tail_ == CDR(chunks_sexp_)) { + // One chunk (return the chunk) + SetValue(CAR(chunks_tail_)); + + } else { + // Many chunks (concatenate or rbind) + SEXP fun; + if (Rf_inherits(ptype_sexp_, "data.frame")) { + 
fun = PROTECT(Rf_install("rbind")); + } else { + fun = PROTECT(Rf_install("c")); + } + + SETCAR(chunks_sexp_, fun); + UNPROTECT(1); + + SEXP result = PROTECT(Rf_eval(chunks_sexp_, R_BaseEnv)); + SetValue(result); + UNPROTECT(1); + } + + nanoarrow_release_sexp(chunks_sexp_); + chunks_sexp_ = R_NilValue; + chunks_tail_ = R_NilValue; + return NANOARROW_OK; + } + + private: + SEXP chunks_sexp_; + SEXP chunks_tail_; + + void Append(SEXP chunk_sexp) { + if (chunks_sexp_ == R_NilValue) { + // Not sure if we will need no function, c, or rbind when we + // create this, so leave it as R_NilValue for now. + SEXP chunks_init = PROTECT(Rf_lang1(R_NilValue)); + chunks_sexp_ = chunks_init; + nanoarrow_preserve_sexp(chunks_sexp_); + chunks_tail_ = chunks_sexp_; + UNPROTECT(1); + } + + SEXP next_sexp = PROTECT(Rf_lcons(chunk_sexp, R_NilValue)); + SETCDR(chunks_tail_, next_sexp); + UNPROTECT(1); + chunks_tail_ = next_sexp; + } +}; + +#endif diff --git a/r/src/vctr_builder_posixct.h b/r/src/vctr_builder_posixct.h new file mode 100644 index 000000000..4ca5dff80 --- /dev/null +++ b/r/src/vctr_builder_posixct.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_POSIXCT_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_POSIXCT_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_dbl.h" + +class PosixctBuilder : public DblBuilder { + public: + explicit PosixctBuilder(SEXP ptype_sexp) + : DblBuilder(ptype_sexp, VECTOR_TYPE_POSIXCT), scale_(0) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(DblBuilder::Init(schema, options, error)); + + ArrowTimeUnit time_unit = NANOARROW_TIME_UNIT_SECOND; + switch (schema_view_.type) { + case NANOARROW_TYPE_NA: + break; + case NANOARROW_TYPE_DATE64: + time_unit = NANOARROW_TIME_UNIT_MILLI; + break; + case NANOARROW_TYPE_TIMESTAMP: + time_unit = schema_view_.time_unit; + break; + default: + StopCantConvert(); + } + + scale_ = 1; + + switch (time_unit) { + case NANOARROW_TIME_UNIT_SECOND: + scale_ *= 1; + break; + case NANOARROW_TIME_UNIT_MILLI: + scale_ *= 1e-3; + break; + case NANOARROW_TIME_UNIT_MICRO: + scale_ *= 1e-6; + break; + case NANOARROW_TIME_UNIT_NANO: + scale_ *= 1e-9; + break; + default: + return EINVAL; + } + + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + R_xlen_t value_size0 = value_size_; + NANOARROW_RETURN_NOT_OK(DblBuilder::PushNext(array_shelter, array, error)); + + if (scale_ != 1) { + double* result = REAL(value_); + for (int64_t i = 0; i < array_view_.length; i++) { + result[value_size0 + i] = result[value_size0 + i] * scale_; + } + } + + return NANOARROW_OK; + } + + private: + double scale_; +}; + +#endif diff --git a/r/src/vctr_builder_rcrd.h b/r/src/vctr_builder_rcrd.h new file mode 100644 index 000000000..d0f7f9fa9 --- /dev/null +++ b/r/src/vctr_builder_rcrd.h @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef R_NANOARROW_VCTR_BUILDER_RCRD_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_RCRD_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include +#include + +#include "vctr_builder_base.h" + +class RcrdBuilder : public VctrBuilder { + public: + explicit RcrdBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_DATA_FRAME, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + // TODO: Check can convert here + + // Instantiate and initialize children + children_.resize(schema->n_children); + for (int64_t i = 0; i < schema->n_children; i++) { + SEXP child_ptype_sexp; + if (ptype_sexp_ != R_NilValue) { + child_ptype_sexp = VECTOR_ELT(ptype_sexp_, i); + } else { + child_ptype_sexp = R_NilValue; + } + + VctrBuilder* child = nullptr; + NANOARROW_RETURN_NOT_OK(InstantiateBuilder(schema->children[i], child_ptype_sexp, + options, &child, error)); + children_[i].reset(child); + NANOARROW_RETURN_NOT_OK(child->Init(schema->children[i], options, error)); + } + + schema_ = schema; + return NANOARROW_OK; + } + + SEXP GetPtype() override { + if (ptype_sexp_ != R_NilValue) { + return ptype_sexp_; + } + + SEXP result = PROTECT(Rf_allocVector(VECSXP, schema_->n_children)); + SEXP result_names = 
PROTECT(Rf_allocVector(STRSXP, schema_->n_children)); + for (R_xlen_t i = 0; i < schema_->n_children; i++) { + struct ArrowSchema* child = schema_->children[i]; + if (child->name != NULL) { + SET_STRING_ELT(result_names, i, Rf_mkCharCE(child->name, CE_UTF8)); + } else { + SET_STRING_ELT(result_names, i, Rf_mkChar("")); + } + + SEXP child_sexp = PROTECT(children_[i]->GetPtype()); + SET_VECTOR_ELT(result, i, child_sexp); + UNPROTECT(1); + } + + Rf_setAttrib(result, R_ClassSymbol, nanoarrow_cls_data_frame); + Rf_setAttrib(result, R_NamesSymbol, result_names); + SEXP rownames = PROTECT(Rf_allocVector(INTSXP, 2)); + INTEGER(rownames)[0] = NA_INTEGER; + INTEGER(rownames)[1] = 0; + Rf_setAttrib(result, R_RowNamesSymbol, rownames); + UNPROTECT(3); + return result; + } + + private: + std::vector> children_; +}; + +#endif diff --git a/r/src/vctr_builder_unspecified.h b/r/src/vctr_builder_unspecified.h new file mode 100644 index 000000000..1d39b2d07 --- /dev/null +++ b/r/src/vctr_builder_unspecified.h @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef R_NANOARROW_VCTR_BUILDER_UNSPECIFIED_H_INCLUDED +#define R_NANOARROW_VCTR_BUILDER_UNSPECIFIED_H_INCLUDED + +#define R_NO_REMAP +#include +#include + +#include "vctr_builder_base.h" + +class UnspecifiedBuilder : public VctrBuilder { + public: + explicit UnspecifiedBuilder(SEXP ptype_sexp) + : VctrBuilder(VECTOR_TYPE_UNSPECIFIED, ptype_sexp) {} + + ArrowErrorCode Init(const ArrowSchema* schema, VctrBuilderOptions options, + ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Init(schema, options, error)); + switch (schema_view_.type) { + case NANOARROW_TYPE_DICTIONARY: + StopCantConvert(); + default: + break; + } + + return NANOARROW_OK; + } + + ArrowErrorCode Reserve(R_xlen_t n, ArrowError* error) override { + NANOARROW_RETURN_NOT_OK(VctrBuilder::Reserve(n, error)); + SEXP value = PROTECT(Rf_allocVector(LGLSXP, n)); + SetValue(value); + UNPROTECT(1); + return NANOARROW_OK; + } + + ArrowErrorCode PushNext(SEXP array_shelter, const ArrowArray* array, + ArrowError* error) override { + int64_t not_null_count; + if (array->null_count == -1 && array->buffers[0] == nullptr) { + not_null_count = array->length; + } else if (array->null_count == -1) { + not_null_count = + ArrowBitCountSet(reinterpret_cast(array->buffers[0]), + array->offset, array->length); + } else { + not_null_count = array->length - array->null_count; + } + + if (not_null_count > 0 && array->length > 0) { + NANOARROW_RETURN_NOT_OK( + WarnLossyConvert("that were non-null set to NA", not_null_count)); + } + + int* value_ptr = LOGICAL(value_) + value_size_; + for (int64_t i = 0; i < array->length; i++) { + value_ptr[i] = NA_LOGICAL; + } + + return NANOARROW_OK; + } +}; + +#endif diff --git a/r/tools/make-callentries.R b/r/tools/make-callentries.R index 403169c3c..552ff05fb 100644 --- a/r/tools/make-callentries.R +++ b/r/tools/make-callentries.R @@ -21,7 +21,7 @@ library(tidyverse) -src_files <- list.files("src", "\\.(c|cpp)$", full.names = TRUE) %>% +src_files <- list.files("src", 
"\\.(c|cc)$", full.names = TRUE) %>% setdiff("src/init.c") src_sources <- src_files %>% set_names() %>% map_chr(readr::read_file)