Skip to content

Commit fe5082f

Browse files
authored
feat(python): Allow creation of dictionary and list types (#445)
This PR adds support for creating dictionary and list types:

```python
import nanoarrow as na

na.list_of(na.int32())
#> Schema(LIST, value_type=Schema(INT32, name='item'))

na.dictionary(na.int32(), na.string())
#> Schema(DICTIONARY, index_type=Schema(INT32), value_type=Schema(STRING), dictionary_ordered=False)
```

Before this change, creating these types (or the associated arrays from buffers) was not possible. Supporting them required some changes to `modify()` so that `children` and `dictionary` can also be set there.
1 parent 28e8123 commit fe5082f

9 files changed

+405
-106
lines changed

python/src/nanoarrow/__init__.py

+10-2
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
Type,
4444
TimeUnit,
4545
null,
46-
bool,
46+
bool_,
4747
int8,
4848
uint8,
4949
int16,
@@ -57,6 +57,10 @@
5757
float64,
5858
string,
5959
large_string,
60+
list_,
61+
large_list,
62+
fixed_size_list,
63+
dictionary,
6064
binary,
6165
large_binary,
6266
fixed_size_binary,
@@ -88,7 +92,7 @@
8892
"allocate_c_array_stream",
8993
"allocate_c_schema",
9094
"binary",
91-
"bool",
95+
"bool_",
9296
"c_array",
9397
"c_array_from_buffers",
9498
"c_array_stream",
@@ -102,9 +106,11 @@
102106
"date64",
103107
"decimal128",
104108
"decimal256",
109+
"dictionary",
105110
"duration",
106111
"extension_type",
107112
"fixed_size_binary",
113+
"fixed_size_list",
108114
"float16",
109115
"float32",
110116
"float64",
@@ -117,6 +123,8 @@
117123
"interval_months",
118124
"large_binary",
119125
"large_string",
126+
"large_list",
127+
"list_",
120128
"null",
121129
"string",
122130
"struct",

python/src/nanoarrow/_lib.pyx

+64-16
Original file line number | Diff line number | Diff line change
@@ -769,23 +769,55 @@ cdef class CSchema:
769769
else:
770770
return None
771771

772-
def modify(self, *, name=None, flags=None, nullable=None, metadata=None,
773-
validate=True):
774-
builder = CSchemaBuilder.copy(self)
772+
def modify(self, *, format=None, name=None, flags=None, nullable=None,
773+
metadata=None, children=None, dictionary=None, validate=True):
774+
cdef CSchemaBuilder builder = CSchemaBuilder.allocate()
775775

776-
if name is not None:
776+
if format is None:
777+
builder.set_format(self.format)
778+
else:
779+
builder.set_format(format)
780+
781+
if name is None:
782+
builder.set_name(self.name)
783+
elif name is not False:
777784
builder.set_name(name)
778785

779-
if flags is not None:
786+
if flags is None:
787+
builder.set_flags(self.flags)
788+
else:
780789
builder.set_flags(flags)
781790

782791
if nullable is not None:
783792
builder.set_nullable(nullable)
784793

785-
if metadata is not None:
786-
builder.clear_metadata()
794+
if metadata is None:
795+
if self.metadata is not None:
796+
builder.append_metadata(self.metadata)
797+
else:
787798
builder.append_metadata(metadata)
788799

800+
if children is None:
801+
if self.n_children > 0:
802+
builder.allocate_children(self.n_children)
803+
for i, child in enumerate(self.children):
804+
builder.set_child(i, None, child)
805+
elif hasattr(children, "items"):
806+
builder.allocate_children(len(children))
807+
for i, item in enumerate(children.items()):
808+
name, child = item
809+
builder.set_child(i, name, child)
810+
else:
811+
builder.allocate_children(len(children))
812+
for i, child in enumerate(children):
813+
builder.set_child(i, None, child)
814+
815+
if dictionary is None:
816+
if self.dictionary:
817+
builder.set_dictionary(self.dictionary)
818+
elif dictionary is not False:
819+
builder.set_dictionary(dictionary)
820+
789821
if validate:
790822
builder.validate()
791823

@@ -1036,19 +1068,10 @@ cdef class CSchemaBuilder:
10361068
if self._ptr.release == NULL:
10371069
ArrowSchemaInit(self._ptr)
10381070

1039-
@staticmethod
1040-
def copy(CSchema schema):
1041-
return CSchemaBuilder(schema.__deepcopy__())
1042-
10431071
@staticmethod
10441072
def allocate():
10451073
return CSchemaBuilder(CSchema.allocate())
10461074

1047-
def clear_metadata(self):
1048-
cdef int code = ArrowSchemaSetMetadata(self.c_schema._ptr, NULL)
1049-
Error.raise_error_not_ok("ArrowSchemaSetMetadata()", code)
1050-
return self
1051-
10521075
def append_metadata(self, metadata):
10531076
cdef CBuffer buffer = CBuffer.empty()
10541077

@@ -1164,6 +1187,23 @@ cdef class CSchemaBuilder:
11641187
if name is not None:
11651188
name = str(name)
11661189
code = ArrowSchemaSetName(self._ptr.children[i], name.encode("UTF-8"))
1190+
Error.raise_error_not_ok("ArrowSchemaSetName()", code)
1191+
1192+
return self
1193+
1194+
def set_dictionary(self, CSchema dictionary):
1195+
self.c_schema._assert_valid()
1196+
1197+
cdef int code
1198+
if self._ptr.dictionary == NULL:
1199+
code = ArrowSchemaAllocateDictionary(self._ptr)
1200+
Error.raise_error_not_ok("ArrowSchemaAllocateDictionary()", code)
1201+
1202+
if self._ptr.dictionary.release != NULL:
1203+
ArrowSchemaRelease(self._ptr.dictionary)
1204+
1205+
code = ArrowSchemaDeepCopy(dictionary._ptr, self._ptr.dictionary)
1206+
Error.raise_error_not_ok("ArrowSchemaDeepCopy()", code)
11671207

11681208
return self
11691209

@@ -1179,6 +1219,14 @@ cdef class CSchemaBuilder:
11791219

11801220
return self
11811221

1222+
def set_dictionary_ordered(self, dictionary_ordered):
1223+
if dictionary_ordered:
1224+
self._ptr.flags = self._ptr.flags | ARROW_FLAG_DICTIONARY_ORDERED
1225+
else:
1226+
self._ptr.flags = self._ptr.flags & ~ARROW_FLAG_DICTIONARY_ORDERED
1227+
1228+
return self
1229+
11821230
def validate(self):
11831231
return CSchemaView(self.c_schema)
11841232

0 commit comments

Comments
 (0)