Skip to content

Commit 9a00532

Browse files
authored
feat: Add ArrowArrayViewComputeNullCount (#562)
Adds `ArrowArrayViewComputeNullCount()` and tests. Extracted from #555.
1 parent fcf3a80 commit 9a00532

File tree

6 files changed

+124
-18
lines changed

6 files changed

+124
-18
lines changed

python/src/nanoarrow/_lib.pyx

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,10 +365,7 @@ cdef class CArrayView:
365365
elif validity_bits == NULL:
366366
self._ptr.null_count = 0
367367
elif self._device is DEVICE_CPU:
368-
self._ptr.null_count = (
369-
self._ptr.length -
370-
ArrowBitCountSet(validity_bits, self.offset, self.length)
371-
)
368+
self._ptr.null_count = ArrowArrayViewComputeNullCount(self._ptr)
372369

373370
return self._ptr.null_count
374371

src/nanoarrow/common/array_test.cc

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1816,6 +1816,97 @@ TEST(ArrayTest, ArrayViewTestBasic) {
18161816
ArrowArrayViewReset(&array_view);
18171817
}
18181818

1819+
// Exercises ArrowArrayViewComputeNullCount() on a hand-assembled int32 array
// whose validity buffer is, in turn, fully set, fully unset, half set, and
// absent (nullptr), then on the same array with its length forced to zero.
TEST(ArrayTest, ArrayViewTestComputeNullCount) {
  struct ArrowError error;

  int32_t data[] = {17, 87, 23, 53};
  uint8_t all_valid = 0b1111'1111;
  uint8_t all_null = 0b0000'0000;
  uint8_t half_valid = 0b1010'1010;
  uint8_t* all_valid_because_missing = nullptr;

  // Slot 0 (validity) is swapped per case; slot 1 always points at the values
  const void* buffer_ptrs[2];
  buffer_ptrs[1] = data;

  // The array borrows stack memory, so its release callback does nothing
  nanoarrow::UniqueArray array;
  array->release = [](struct ArrowArray*) {};
  array->buffers = buffer_ptrs;
  array->n_buffers = 2;
  array->children = nullptr;
  array->n_children = 0;
  array->dictionary = nullptr;
  array->offset = 0;
  array->length = 4;

  const struct {
    const uint8_t* validity;
    int64_t expected_null_count;
  } cases[] = {
      {&all_valid, 0},
      {&all_null, array->length},
      {&half_valid, array->length / 2},
      {all_valid_because_missing, 0},
  };

  for (const auto& test_case : cases) {
    buffer_ptrs[0] = test_case.validity;
    array->null_count = test_case.expected_null_count;

    nanoarrow::UniqueArrayView view;
    ArrowArrayViewInitFromType(view.get(), NANOARROW_TYPE_INT32);
    EXPECT_EQ(ArrowArrayViewSetArray(view.get(), array.get(), &error), NANOARROW_OK)
        << error.message;
    EXPECT_EQ(ArrowArrayViewComputeNullCount(view.get()), test_case.expected_null_count);
  }

  // A zero-length array reports no nulls even when the validity bits are unset
  buffer_ptrs[0] = &all_null;
  array->null_count = 0;
  array->length = 0;

  nanoarrow::UniqueArrayView empty_view;
  ArrowArrayViewInitFromType(empty_view.get(), NANOARROW_TYPE_INT32);
  EXPECT_EQ(ArrowArrayViewSetArray(empty_view.get(), array.get(), &error), NANOARROW_OK)
      << error.message;
  EXPECT_EQ(ArrowArrayViewComputeNullCount(empty_view.get()), 0);
}
1865+
1866+
// Builds a two-child dense union, appends one null through each child, and
// checks that ArrowArrayViewComputeNullCount() reports zero for the union.
TEST(ArrayTest, ArrayViewTestComputeNullCountUnion) {
  struct ArrowError error;

  // Dense union schema: child 0 is int32, child 1 is string
  nanoarrow::UniqueSchema union_schema;
  ArrowSchemaInit(union_schema.get());
  ASSERT_EQ(ArrowSchemaSetTypeUnion(union_schema.get(), NANOARROW_TYPE_DENSE_UNION, 2),
            NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaSetType(union_schema->children[0], NANOARROW_TYPE_INT32),
            NANOARROW_OK);
  ASSERT_EQ(ArrowSchemaSetType(union_schema->children[1], NANOARROW_TYPE_STRING),
            NANOARROW_OK);

  nanoarrow::UniqueArray union_array;
  ASSERT_EQ(ArrowArrayInitFromSchema(union_array.get(), union_schema.get(), &error),
            NANOARROW_OK);
  ASSERT_EQ(ArrowArrayStartAppending(union_array.get()), NANOARROW_OK);

  // One null element per child, in child order
  for (int8_t child_id = 0; child_id < 2; ++child_id) {
    ASSERT_EQ(ArrowArrayAppendNull(union_array->children[child_id], 1), NANOARROW_OK);
    ASSERT_EQ(ArrowArrayFinishUnionElement(union_array.get(), child_id), NANOARROW_OK);
  }
  ASSERT_EQ(ArrowArrayFinishBuildingDefault(union_array.get(), &error), NANOARROW_OK);

  nanoarrow::UniqueArrayView union_view;
  ASSERT_EQ(ArrowArrayViewInitFromSchema(union_view.get(), union_schema.get(), &error),
            NANOARROW_OK);
  ASSERT_EQ(ArrowArrayViewSetArray(union_view.get(), union_array.get(), &error),
            NANOARROW_OK);

  // Nulls live in the children; the union itself is expected to report none
  EXPECT_EQ(ArrowArrayViewComputeNullCount(union_view.get()), 0);
}
1892+
1893+
// Appends two batches of nulls to a null-type array and checks that
// ArrowArrayViewComputeNullCount() reports the combined element count.
TEST(ArrayTest, ArrayViewTestComputeNullCountNull) {
  struct ArrowError error;
  const int64_t first_batch = 11;
  const int64_t second_batch = 42;

  nanoarrow::UniqueArray null_array;
  ASSERT_EQ(ArrowArrayInitFromType(null_array.get(), NANOARROW_TYPE_NA), NANOARROW_OK);

  EXPECT_EQ(ArrowArrayStartAppending(null_array.get()), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayAppendNull(null_array.get(), first_batch), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayAppendNull(null_array.get(), second_batch), NANOARROW_OK);
  EXPECT_EQ(ArrowArrayFinishBuildingDefault(null_array.get(), &error), NANOARROW_OK)
      << error.message;

  nanoarrow::UniqueArrayView null_view;
  ArrowArrayViewInitFromType(null_view.get(), NANOARROW_TYPE_NA);
  ASSERT_EQ(ArrowArrayViewSetArray(null_view.get(), null_array.get(), &error),
            NANOARROW_OK);

  // 11 + 42 appended nulls
  EXPECT_EQ(ArrowArrayViewComputeNullCount(null_view.get()), first_batch + second_batch);
}
1909+
18191910
TEST(ArrayTest, ArrayViewTestMove) {
18201911
struct ArrowArrayView array_view;
18211912
ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_STRING);
@@ -2407,7 +2498,7 @@ TEST(ArrayTest, ArrayViewTestUnionChildIndices) {
24072498
ASSERT_EQ(ArrowArrayFinishUnionElement(&array, 1), NANOARROW_OK);
24082499
ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, nullptr), NANOARROW_OK);
24092500

2410-
// The ArrayView for a union could in theroy be created without a schema.
2501+
// The ArrayView for a union could in theory be created without a schema.
24112502
// Currently FULL validation will fail here since we can't guarantee that
24122503
// these are valid.
24132504
ArrowArrayViewInitFromType(&array_view, NANOARROW_TYPE_DENSE_UNION);

src/nanoarrow/common/inline_array.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,31 @@ static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_vie
740740
}
741741
}
742742

743+
static inline int64_t ArrowArrayViewComputeNullCount(
744+
const struct ArrowArrayView* array_view) {
745+
if (array_view->length == 0) {
746+
return 0;
747+
}
748+
749+
switch (array_view->storage_type) {
750+
case NANOARROW_TYPE_NA:
751+
return array_view->length;
752+
case NANOARROW_TYPE_DENSE_UNION:
753+
case NANOARROW_TYPE_SPARSE_UNION:
754+
// Unions are "never null" in Arrow land
755+
return 0;
756+
default:
757+
break;
758+
}
759+
760+
const uint8_t* validity_buffer = array_view->buffer_views[0].data.as_uint8;
761+
if (validity_buffer == NULL) {
762+
return 0;
763+
}
764+
return array_view->length -
765+
ArrowBitCountSet(validity_buffer, array_view->offset, array_view->length);
766+
}
767+
743768
static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view,
744769
int64_t i) {
745770
switch (array_view->storage_type) {

src/nanoarrow/nanoarrow.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,6 +1071,10 @@ void ArrowArrayViewReset(struct ArrowArrayView* array_view);
10711071
static inline int8_t ArrowArrayViewIsNull(const struct ArrowArrayView* array_view,
10721072
int64_t i);
10731073

1074+
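/// \brief Compute null count for an ArrowArrayView
///
/// Returns 0 for zero-length views, for dense and sparse unions, and for views
/// whose validity buffer is NULL; returns the length for the null type;
/// otherwise returns the length minus the number of set validity bits counted
/// over the view's offset and length. The array_view is not modified.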
1075+
static inline int64_t ArrowArrayViewComputeNullCount(
1076+
const struct ArrowArrayView* array_view);
1077+
10741078
/// \brief Get the type id of a union array element
10751079
static inline int8_t ArrowArrayViewUnionTypeId(const struct ArrowArrayView* array_view,
10761080
int64_t i);

src/nanoarrow/nanoarrow_testing.hpp

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2013,20 +2013,8 @@ class TestingJSONReader {
20132013
ArrowBufferView* buffer_view = array_view->buffer_views + i;
20142014
buffer_view->data.as_uint8 = buffer->data;
20152015
buffer_view->size_bytes = buffer->size_bytes;
2016-
2017-
// If this is a validity buffer, set the null_count
2018-
if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY &&
2019-
_ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
2020-
array_view->null_count =
2021-
array_view->length -
2022-
ArrowBitCountSet(buffer_view->data.as_uint8, 0, array_view->length);
2023-
}
2024-
}
2025-
2026-
// The null type doesn't have any buffers but we can set the null_count
2027-
if (array_view->storage_type == NANOARROW_TYPE_NA) {
2028-
array_view->null_count = array_view->length;
20292016
}
2017+
array_view->null_count = ArrowArrayViewComputeNullCount(array_view);
20302018

20312019
// If there is a dictionary associated with schema, parse its value into dictionary
20322020
if (schema->dictionary != nullptr) {

thirdparty/zlib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,4 @@ fetchcontent_makeavailable(nanoarrow_zlib)
2323

2424
add_library(ZLIB::ZLIB ALIAS zlibstatic)
2525
target_include_directories(zlibstatic INTERFACE ${zlib_BINARY_DIR} ${zlib_SOURCE_DIR})
26+
target_include_directories(zlib INTERFACE ${zlib_BINARY_DIR} ${zlib_SOURCE_DIR})

0 commit comments

Comments
 (0)