Skip to content

Commit

Permalink
feat: Implement LIST_VIEW and LARGE_LIST_VIEW support
Browse files Browse the repository at this point in the history
  • Loading branch information
WillAyd committed Feb 4, 2025
1 parent 76fb7ee commit bd49ded
Show file tree
Hide file tree
Showing 7 changed files with 331 additions and 24 deletions.
76 changes: 68 additions & 8 deletions src/nanoarrow/common/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
case NANOARROW_TYPE_LARGE_STRING:
case NANOARROW_TYPE_BINARY:
case NANOARROW_TYPE_LARGE_BINARY:
case NANOARROW_TYPE_LIST_VIEW:
case NANOARROW_TYPE_LARGE_LIST_VIEW:
array->n_buffers = 3;
break;

Expand Down Expand Up @@ -687,6 +689,9 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length)
case NANOARROW_BUFFER_TYPE_VALIDITY:
array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length);
continue;
case NANOARROW_BUFFER_TYPE_SIZE:
array_view->buffer_views[i].size_bytes = element_size_bytes * length;
continue;
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
// Probably don't want/need to rely on the producer to have allocated an
// offsets buffer of length 1 for a zero-size array
Expand Down Expand Up @@ -856,11 +861,20 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,

min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length);
break;
case NANOARROW_BUFFER_TYPE_SIZE:
min_buffer_size_bytes = element_size_bytes * offset_plus_length;
break;
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
// Probably don't want/need to rely on the producer to have allocated an
// offsets buffer of length 1 for a zero-size array
min_buffer_size_bytes =
(offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1);
if (array_view->storage_type == NANOARROW_TYPE_LIST_VIEW ||
array_view->storage_type == NANOARROW_TYPE_LARGE_LIST_VIEW) {
min_buffer_size_bytes =
(offset_plus_length != 0) * element_size_bytes * offset_plus_length;
} else {
// Probably don't want/need to rely on the producer to have allocated an
// offsets buffer of length 1 for a zero-size array
min_buffer_size_bytes =
(offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1);
}
break;
case NANOARROW_BUFFER_TYPE_DATA:
min_buffer_size_bytes =
Expand Down Expand Up @@ -898,6 +912,8 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
case NANOARROW_TYPE_LARGE_LIST:
case NANOARROW_TYPE_FIXED_SIZE_LIST:
case NANOARROW_TYPE_MAP:
case NANOARROW_TYPE_LIST_VIEW:
case NANOARROW_TYPE_LARGE_LIST_VIEW:
if (array_view->n_children != 1) {
ArrowErrorSet(error,
"Expected 1 child of %s array but found %" PRId64 " child arrays",
Expand Down Expand Up @@ -1132,6 +1148,10 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,

case NANOARROW_TYPE_LIST:
case NANOARROW_TYPE_MAP:
case NANOARROW_TYPE_LIST_VIEW: {
const size_t idx = array_view->storage_type == NANOARROW_TYPE_LIST_VIEW
? offset_plus_length - 1
: offset_plus_length;
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset];
if (first_offset < 0) {
Expand All @@ -1140,7 +1160,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
return EINVAL;
}

last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
last_offset = array_view->buffer_views[1].data.as_int32[idx];
if (last_offset < 0) {
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
last_offset);
Expand All @@ -1157,9 +1177,31 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
return EINVAL;
}
}

if ((array_view->storage_type == NANOARROW_TYPE_LIST_VIEW) &&
(array_view->buffer_views[2].size_bytes != 0)) {
const int64_t first_size =
array_view->buffer_views[2].data.as_int32[array_view->offset];
if (first_size < 0) {
ArrowErrorSet(error, "Expected first size >= 0 but found %" PRId64, first_size);
return EINVAL;
}

const int64_t last_size =
array_view->buffer_views[2].data.as_int32[offset_plus_length - 1];
if (last_size < 0) {
ArrowErrorSet(error, "Expected last size >= 0 but found %" PRId64, last_size);
return EINVAL;
}
}
break;
}

case NANOARROW_TYPE_LARGE_LIST:
case NANOARROW_TYPE_LARGE_LIST_VIEW: {
const size_t idx = array_view->storage_type == NANOARROW_TYPE_LARGE_LIST_VIEW
? offset_plus_length - 1
: offset_plus_length;
if (array_view->buffer_views[1].size_bytes != 0) {
first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset];
if (first_offset < 0) {
Expand All @@ -1168,7 +1210,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
return EINVAL;
}

last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
last_offset = array_view->buffer_views[1].data.as_int64[idx];
if (last_offset < 0) {
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
last_offset);
Expand All @@ -1177,14 +1219,32 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,

if (array_view->children[0]->length < last_offset) {
ArrowErrorSet(error,
"Expected child of large list array to have length >= %" PRId64
"Expected child of %s array to have length >= %" PRId64
" but found array "
"with length %" PRId64,
last_offset, array_view->children[0]->length);
ArrowTypeString(array_view->storage_type), last_offset,
array_view->children[0]->length);
return EINVAL;
}
}

if ((array_view->storage_type == NANOARROW_TYPE_LARGE_LIST_VIEW) &&
(array_view->buffer_views[2].size_bytes != 0)) {
const int64_t first_size =
array_view->buffer_views[2].data.as_int64[array_view->offset];
if (first_size < 0) {
ArrowErrorSet(error, "Expected first size >= 0 but found %" PRId64, first_size);
return EINVAL;
}

const int64_t last_size = array_view->buffer_views[2].data.as_int64[idx];
if (last_size < 0) {
ArrowErrorSet(error, "Expected last size >= 0 but found %" PRId64, last_size);
return EINVAL;
}
}
break;
}

case NANOARROW_TYPE_RUN_END_ENCODED: {
struct ArrowArrayView* run_ends_view = array_view->children[0];
Expand Down
145 changes: 144 additions & 1 deletion src/nanoarrow/common/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1525,7 +1525,7 @@ TEST(ArrayTest, ArrayTestAppendToLargeListArray) {
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
EXPECT_STREQ(
ArrowErrorMessage(&error),
"Expected child of large list array to have length >= 3 but found array with "
"Expected child of large_list array to have length >= 3 but found array with "
"length 2");

array.children[0]->length = array.children[0]->length + 1;
Expand Down Expand Up @@ -1555,6 +1555,149 @@ TEST(ArrayTest, ArrayTestAppendToLargeListArray) {
#endif
}

// Builds a list_view<int64> array with the nanoarrow append API, holding
// [[123], null, [456, 789], []]. Checks that finishing rejects a wrong child
// count and, when the tests are built against Arrow C++, compares the result
// with the same logical array assembled by arrow::ListViewBuilder.
TEST(ArrayTest, ArrayTestAppendToListViewArray) {
  struct ArrowArray array;
  struct ArrowSchema schema;
  struct ArrowError error;

  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_LIST_VIEW), NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT64), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);

  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);

  // Check that we can reserve recursively without erroring; the child's data
  // buffer is expected to still report zero capacity afterwards.
  ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayBuffer(array.children[0], 1)->capacity_bytes, 0);

  // Element 0: [123]
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 1: null
  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);

  // Element 2: [456, 789]
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 456), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 789), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 3: []
  EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);

  // Make sure number of children is checked at finish
  array.n_children = 0;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(ArrowErrorMessage(&error),
               "Expected 1 child of list_view array but found 0 child arrays");
  array.n_children = 1;

  // Make sure final child size is checked at finish
  // TODO: this may be an expensive check with LIST_VIEW types
  /*
  array.children[0]->length = array.children[0]->length - 1;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(
      ArrowErrorMessage(&error),
      "Expected child of list_view array to have length >= 3 but found array with "
      "length 2");
  array.children[0]->length = array.children[0]->length + 1;
  */
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), NANOARROW_OK);

#if defined(NANOARROW_BUILD_TESTS_WITH_ARROW)
  auto arrow_array = ImportArray(&array, &schema);
  ARROW_EXPECT_OK(arrow_array);

  constexpr size_t nelems = 4;
  const std::array<int32_t, nelems> offsets = {0, 0, 1, 0};
  const std::array<int32_t, nelems> sizes = {1, 0, 2, 0};
  const std::array<uint8_t, nelems> valid_bytes = {1, 0, 1, 1};

  // The offsets/sizes above index into the child values, so the child builder
  // has to hold 123, 456, 789 for the expected array to be well-formed.
  auto child_builder = std::make_shared<Int64Builder>();
  ARROW_EXPECT_OK(child_builder->Append(123));
  ARROW_EXPECT_OK(child_builder->Append(456));
  ARROW_EXPECT_OK(child_builder->Append(789));

  auto builder =
      ListViewBuilder(default_memory_pool(), child_builder, list_view(int64()));
  ARROW_EXPECT_OK(
      builder.AppendValues(offsets.data(), sizes.data(), nelems, valid_bytes.data()));
  auto expected_array = builder.Finish();
  ARROW_EXPECT_OK(expected_array);

  EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
#else
  // Without Arrow nothing takes ownership of the structures, so release them here.
  ArrowSchemaRelease(&schema);
  ArrowArrayRelease(&array);
#endif
}

// Builds a large_list_view<int64> array with the nanoarrow append API, holding
// [[123], null, [456, 789], []]. Checks that finishing rejects a wrong child
// count and, when the tests are built against Arrow C++, compares the result
// with the same logical array assembled by arrow::LargeListViewBuilder.
TEST(ArrayTest, ArrayTestAppendToLargeListViewArray) {
  struct ArrowArray array;
  struct ArrowSchema schema;
  struct ArrowError error;

  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_LARGE_LIST_VIEW),
            NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT64), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);

  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);

  // Check that we can reserve recursively without erroring; the child's data
  // buffer is expected to still report zero capacity afterwards.
  ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayBuffer(array.children[0], 1)->capacity_bytes, 0);

  // Element 0: [123]
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 1: null
  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);

  // Element 2: [456, 789]
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 456), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 789), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 3: []
  EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);

  // Make sure number of children is checked at finish
  array.n_children = 0;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(ArrowErrorMessage(&error),
               "Expected 1 child of large_list_view array but found 0 child arrays");
  array.n_children = 1;

  // Make sure final child size is checked at finish
  // TODO: this may be an expensive check with LIST_VIEW types
  /*
  array.children[0]->length = array.children[0]->length - 1;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(
      ArrowErrorMessage(&error),
      "Expected child of large_list_view array to have length >= 3 but found array "
      "with length 2");
  array.children[0]->length = array.children[0]->length + 1;
  */
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), NANOARROW_OK);

#if defined(NANOARROW_BUILD_TESTS_WITH_ARROW)
  auto arrow_array = ImportArray(&array, &schema);
  ARROW_EXPECT_OK(arrow_array);

  constexpr size_t nelems = 4;
  const std::array<int64_t, nelems> offsets = {0, 0, 1, 0};
  const std::array<int64_t, nelems> sizes = {1, 0, 2, 0};
  const std::array<uint8_t, nelems> valid_bytes = {1, 0, 1, 1};

  // The offsets/sizes above index into the child values, so the child builder
  // has to hold 123, 456, 789 for the expected array to be well-formed.
  auto child_builder = std::make_shared<Int64Builder>();
  ARROW_EXPECT_OK(child_builder->Append(123));
  ARROW_EXPECT_OK(child_builder->Append(456));
  ARROW_EXPECT_OK(child_builder->Append(789));

  // The declared type must be the 64-bit variant to match LargeListViewBuilder.
  auto builder = LargeListViewBuilder(default_memory_pool(), child_builder,
                                      large_list_view(int64()));
  ARROW_EXPECT_OK(
      builder.AppendValues(offsets.data(), sizes.data(), nelems, valid_bytes.data()));
  auto expected_array = builder.Finish();
  ARROW_EXPECT_OK(expected_array);

  EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
#else
  // Without Arrow nothing takes ownership of the structures, so release them here.
  ArrowSchemaRelease(&schema);
  ArrowArrayRelease(&array);
#endif
}

TEST(ArrayTest, ArrayTestAppendToMapArray) {
struct ArrowArray array;
struct ArrowSchema schema;
Expand Down
Loading

0 comments on commit bd49ded

Please sign in to comment.