Skip to content

Commit bd49ded

Browse files
committed
feat: Implement LIST_VIEW and LARGE_LIST_VIEW support
1 parent 76fb7ee commit bd49ded

File tree

7 files changed

+331
-24
lines changed

7 files changed

+331
-24
lines changed

src/nanoarrow/common/array.c

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ static ArrowErrorCode ArrowArraySetStorageType(struct ArrowArray* array,
123123
case NANOARROW_TYPE_LARGE_STRING:
124124
case NANOARROW_TYPE_BINARY:
125125
case NANOARROW_TYPE_LARGE_BINARY:
126+
case NANOARROW_TYPE_LIST_VIEW:
127+
case NANOARROW_TYPE_LARGE_LIST_VIEW:
126128
array->n_buffers = 3;
127129
break;
128130

@@ -687,6 +689,9 @@ void ArrowArrayViewSetLength(struct ArrowArrayView* array_view, int64_t length)
687689
case NANOARROW_BUFFER_TYPE_VALIDITY:
688690
array_view->buffer_views[i].size_bytes = _ArrowBytesForBits(length);
689691
continue;
692+
case NANOARROW_BUFFER_TYPE_SIZE:
693+
array_view->buffer_views[i].size_bytes = element_size_bytes * length;
694+
continue;
690695
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
691696
// Probably don't want/need to rely on the producer to have allocated an
692697
// offsets buffer of length 1 for a zero-size array
@@ -856,11 +861,20 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
856861

857862
min_buffer_size_bytes = _ArrowBytesForBits(offset_plus_length);
858863
break;
864+
case NANOARROW_BUFFER_TYPE_SIZE:
865+
min_buffer_size_bytes = element_size_bytes * offset_plus_length;
866+
break;
859867
case NANOARROW_BUFFER_TYPE_DATA_OFFSET:
860-
// Probably don't want/need to rely on the producer to have allocated an
861-
// offsets buffer of length 1 for a zero-size array
862-
min_buffer_size_bytes =
863-
(offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1);
868+
if (array_view->storage_type == NANOARROW_TYPE_LIST_VIEW ||
869+
array_view->storage_type == NANOARROW_TYPE_LARGE_LIST_VIEW) {
870+
min_buffer_size_bytes =
871+
(offset_plus_length != 0) * element_size_bytes * offset_plus_length;
872+
} else {
873+
// Probably don't want/need to rely on the producer to have allocated an
874+
// offsets buffer of length 1 for a zero-size array
875+
min_buffer_size_bytes =
876+
(offset_plus_length != 0) * element_size_bytes * (offset_plus_length + 1);
877+
}
864878
break;
865879
case NANOARROW_BUFFER_TYPE_DATA:
866880
min_buffer_size_bytes =
@@ -898,6 +912,8 @@ static int ArrowArrayViewValidateMinimal(struct ArrowArrayView* array_view,
898912
case NANOARROW_TYPE_LARGE_LIST:
899913
case NANOARROW_TYPE_FIXED_SIZE_LIST:
900914
case NANOARROW_TYPE_MAP:
915+
case NANOARROW_TYPE_LIST_VIEW:
916+
case NANOARROW_TYPE_LARGE_LIST_VIEW:
901917
if (array_view->n_children != 1) {
902918
ArrowErrorSet(error,
903919
"Expected 1 child of %s array but found %" PRId64 " child arrays",
@@ -1132,6 +1148,10 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
11321148

11331149
case NANOARROW_TYPE_LIST:
11341150
case NANOARROW_TYPE_MAP:
1151+
case NANOARROW_TYPE_LIST_VIEW: {
1152+
const size_t idx = array_view->storage_type == NANOARROW_TYPE_LIST_VIEW
1153+
? offset_plus_length - 1
1154+
: offset_plus_length;
11351155
if (array_view->buffer_views[1].size_bytes != 0) {
11361156
first_offset = array_view->buffer_views[1].data.as_int32[array_view->offset];
11371157
if (first_offset < 0) {
@@ -1140,7 +1160,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
11401160
return EINVAL;
11411161
}
11421162

1143-
last_offset = array_view->buffer_views[1].data.as_int32[offset_plus_length];
1163+
last_offset = array_view->buffer_views[1].data.as_int32[idx];
11441164
if (last_offset < 0) {
11451165
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
11461166
last_offset);
@@ -1157,9 +1177,31 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
11571177
return EINVAL;
11581178
}
11591179
}
1180+
1181+
if ((array_view->storage_type == NANOARROW_TYPE_LIST_VIEW) &&
1182+
(array_view->buffer_views[2].size_bytes != 0)) {
1183+
const int64_t first_size =
1184+
array_view->buffer_views[2].data.as_int32[array_view->offset];
1185+
if (first_size < 0) {
1186+
ArrowErrorSet(error, "Expected first size >= 0 but found %" PRId64, first_size);
1187+
return EINVAL;
1188+
}
1189+
1190+
const int64_t last_size =
1191+
array_view->buffer_views[2].data.as_int32[offset_plus_length - 1];
1192+
if (last_size < 0) {
1193+
ArrowErrorSet(error, "Expected last size >= 0 but found %" PRId64, last_size);
1194+
return EINVAL;
1195+
}
1196+
}
11601197
break;
1198+
}
11611199

11621200
case NANOARROW_TYPE_LARGE_LIST:
1201+
case NANOARROW_TYPE_LARGE_LIST_VIEW: {
1202+
const size_t idx = array_view->storage_type == NANOARROW_TYPE_LARGE_LIST_VIEW
1203+
? offset_plus_length - 1
1204+
: offset_plus_length;
11631205
if (array_view->buffer_views[1].size_bytes != 0) {
11641206
first_offset = array_view->buffer_views[1].data.as_int64[array_view->offset];
11651207
if (first_offset < 0) {
@@ -1168,7 +1210,7 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
11681210
return EINVAL;
11691211
}
11701212

1171-
last_offset = array_view->buffer_views[1].data.as_int64[offset_plus_length];
1213+
last_offset = array_view->buffer_views[1].data.as_int64[idx];
11721214
if (last_offset < 0) {
11731215
ArrowErrorSet(error, "Expected last offset >= 0 but found %" PRId64,
11741216
last_offset);
@@ -1177,14 +1219,32 @@ static int ArrowArrayViewValidateDefault(struct ArrowArrayView* array_view,
11771219

11781220
if (array_view->children[0]->length < last_offset) {
11791221
ArrowErrorSet(error,
1180-
"Expected child of large list array to have length >= %" PRId64
1222+
"Expected child of %s array to have length >= %" PRId64
11811223
" but found array "
11821224
"with length %" PRId64,
1183-
last_offset, array_view->children[0]->length);
1225+
ArrowTypeString(array_view->storage_type), last_offset,
1226+
array_view->children[0]->length);
1227+
return EINVAL;
1228+
}
1229+
}
1230+
1231+
if ((array_view->storage_type == NANOARROW_TYPE_LARGE_LIST_VIEW) &&
1232+
(array_view->buffer_views[2].size_bytes != 0)) {
1233+
const int64_t first_size =
1234+
array_view->buffer_views[2].data.as_int64[array_view->offset];
1235+
if (first_size < 0) {
1236+
ArrowErrorSet(error, "Expected first size >= 0 but found %" PRId64, first_size);
1237+
return EINVAL;
1238+
}
1239+
1240+
const int64_t last_size = array_view->buffer_views[2].data.as_int64[idx];
1241+
if (last_size < 0) {
1242+
ArrowErrorSet(error, "Expected last size >= 0 but found %" PRId64, last_size);
11841243
return EINVAL;
11851244
}
11861245
}
11871246
break;
1247+
}
11881248

11891249
case NANOARROW_TYPE_RUN_END_ENCODED: {
11901250
struct ArrowArrayView* run_ends_view = array_view->children[0];

src/nanoarrow/common/array_test.cc

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1525,7 +1525,7 @@ TEST(ArrayTest, ArrayTestAppendToLargeListArray) {
15251525
EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
15261526
EXPECT_STREQ(
15271527
ArrowErrorMessage(&error),
1528-
"Expected child of large list array to have length >= 3 but found array with "
1528+
"Expected child of large_list array to have length >= 3 but found array with "
15291529
"length 2");
15301530

15311531
array.children[0]->length = array.children[0]->length + 1;
@@ -1555,6 +1555,149 @@ TEST(ArrayTest, ArrayTestAppendToLargeListArray) {
15551555
#endif
15561556
}
15571557

1558+
// Builds a list_view<int64> array element by element through the nanoarrow
// append API, checks that finishing validates the child count, and (when the
// tests are built against Arrow C++) compares the result with an array
// produced by arrow's ListViewBuilder.
TEST(ArrayTest, ArrayTestAppendToListViewArray) {
  struct ArrowSchema schema;
  struct ArrowArray array;
  struct ArrowError error;

  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_LIST_VIEW), NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT64), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);

  // The single child holds the int64 values referenced by every list element.
  struct ArrowArray* const values = array.children[0];

  // Reserving on the parent must succeed; the child's data buffer is expected
  // to still report zero capacity afterwards.
  ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayBuffer(values, 1)->capacity_bytes, 0);

  // Element 0: [123]
  ASSERT_EQ(ArrowArrayAppendInt(values, 123), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 1: null
  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);

  // Element 2: [456, 789]
  ASSERT_EQ(ArrowArrayAppendInt(values, 456), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayAppendInt(values, 789), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 3: empty (valid, zero-size) list
  EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);

  // Finishing must reject an array whose child count was tampered with.
  array.n_children = 0;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(ArrowErrorMessage(&error),
               "Expected 1 child of list_view array but found 0 child arrays");
  array.n_children = 1;

  // Make sure final child size is checked at finish
  // TODO: this may be an expensive check with LIST_VIEW types
  //
  // Disabled until that check exists:
  //
  //   values->length = values->length - 1;
  //   EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  //   EXPECT_STREQ(
  //       ArrowErrorMessage(&error),
  //       "Expected child of list_view array to have length >= 3 but found array with "
  //       "length 2");
  //
  //   values->length = values->length + 1;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), NANOARROW_OK);

#if defined(NANOARROW_BUILD_TESTS_WITH_ARROW)
  auto imported = ImportArray(&array, &schema);
  ARROW_EXPECT_OK(imported);

  // Reference layout for the four elements appended above: one (offset, size)
  // pair and one validity byte per list element.
  constexpr size_t kNumLists = 4;
  const std::array<int32_t, kNumLists> list_offsets = {0, 0, 1, 0};
  const std::array<int32_t, kNumLists> list_sizes = {1, 0, 2, 0};
  const std::array<uint8_t, kNumLists> validity = {1, 0, 1, 1};

  auto values_builder = std::make_shared<Int64Builder>();
  ListViewBuilder list_builder(default_memory_pool(), values_builder,
                               list_view(int64()));
  ARROW_EXPECT_OK(list_builder.AppendValues(list_offsets.data(), list_sizes.data(),
                                            kNumLists, validity.data()));
  auto expected = list_builder.Finish();
  ARROW_EXPECT_OK(expected);

  EXPECT_TRUE(imported.ValueUnsafe()->Equals(expected.ValueUnsafe()));
#else
  // Without Arrow nothing consumes the C structures, so release them here.
  ArrowSchemaRelease(&schema);
  ArrowArrayRelease(&array);
#endif
}
1628+
1629+
// Builds a large_list_view<int64> array element by element through the
// nanoarrow append API, checks that finishing validates the child count, and
// (when the tests are built against Arrow C++) compares the result with an
// array produced by arrow's LargeListViewBuilder.
TEST(ArrayTest, ArrayTestAppendToLargeListViewArray) {
  struct ArrowArray array;
  struct ArrowSchema schema;
  struct ArrowError error;

  ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_LARGE_LIST_VIEW),
            NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT64), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, nullptr), NANOARROW_OK);

  ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK);

  // Check that we can reserve recursively without erroring
  ASSERT_EQ(ArrowArrayReserve(&array, 5), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayBuffer(array.children[0], 1)->capacity_bytes, 0);

  // Element 0: [123]
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 123), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 1: null
  ASSERT_EQ(ArrowArrayAppendNull(&array, 1), NANOARROW_OK);

  // Element 2: [456, 789]
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 456), NANOARROW_OK);
  ASSERT_EQ(ArrowArrayAppendInt(array.children[0], 789), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishElement(&array), NANOARROW_OK);

  // Element 3: empty (valid, zero-size) list
  EXPECT_EQ(ArrowArrayAppendEmpty(&array, 1), NANOARROW_OK);

  // Make sure number of children is checked at finish
  array.n_children = 0;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(ArrowErrorMessage(&error),
               "Expected 1 child of large_list_view array but found 0 child arrays");
  array.n_children = 1;

  // Make sure final child size is checked at finish
  // TODO: this may be an expensive check with LIST_VIEW types
  /*
  array.children[0]->length = array.children[0]->length - 1;
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), EINVAL);
  EXPECT_STREQ(
      ArrowErrorMessage(&error),
      "Expected child of large_list_view array to have length >= 3 but found array with "
      "length 2");

  array.children[0]->length = array.children[0]->length + 1;
  */
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), NANOARROW_OK);

#if defined(NANOARROW_BUILD_TESTS_WITH_ARROW)
  auto arrow_array = ImportArray(&array, &schema);
  ARROW_EXPECT_OK(arrow_array);

  // Reference layout for the four elements appended above: one 64-bit
  // (offset, size) pair and one validity byte per list element.
  constexpr size_t nelems = 4;
  const std::array<int64_t, nelems> offsets = {0, 0, 1, 0};
  const std::array<int64_t, nelems> sizes = {1, 0, 2, 0};
  const std::array<uint8_t, nelems> valid_bytes = {1, 0, 1, 1};

  // Pass the large_list_view type so the builder's declared type matches the
  // 64-bit offsets/sizes it is fed (previously list_view(int64()) was passed,
  // a copy-paste from the LIST_VIEW test).
  auto child_builder = std::make_shared<Int64Builder>();
  auto builder = LargeListViewBuilder(default_memory_pool(), child_builder,
                                      large_list_view(int64()));
  ARROW_EXPECT_OK(
      builder.AppendValues(offsets.data(), sizes.data(), nelems, valid_bytes.data()));
  auto expected_array = builder.Finish();
  ARROW_EXPECT_OK(expected_array);

  EXPECT_TRUE(arrow_array.ValueUnsafe()->Equals(expected_array.ValueUnsafe()));
#else
  // Without Arrow nothing consumes the C structures, so release them here.
  ArrowSchemaRelease(&schema);
  ArrowArrayRelease(&array);
#endif
}
1700+
15581701
TEST(ArrayTest, ArrayTestAppendToMapArray) {
15591702
struct ArrowArray array;
15601703
struct ArrowSchema schema;

0 commit comments

Comments
 (0)