Skip to content

Commit

Permalink
feat(python): Add map type constructor (#687)
Browse files Browse the repository at this point in the history
When working on fixing a problem with Arrow C++'s map type import, I
realized there was no way to create map types or any of the canonical
extensions. Extensions are slightly different and I'll tackle them
later, but map types are relatively straightforward and follow the
existing pattern of all the other constructors:

```python
import nanoarrow as na
na.map_(na.string(), na.int32())
#> <Schema> map<entries: struct<key: string, value: int32>>
```
  • Loading branch information
paleolimbot authored Nov 20, 2024
1 parent 116cdad commit 253b7ec
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 11 deletions.
2 changes: 2 additions & 0 deletions python/src/nanoarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
list_,
large_list,
fixed_size_list,
map_,
dictionary,
binary,
large_binary,
Expand Down Expand Up @@ -115,6 +116,7 @@
"large_string",
"large_list",
"list_",
"map_",
"null",
"nulls_as_sentinel",
"nulls_forbid",
Expand Down
8 changes: 8 additions & 0 deletions python/src/nanoarrow/_schema.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,14 @@ cdef class CSchemaBuilder:

return self

def set_map_keys_sorted(self, map_keys_sorted) -> CSchemaBuilder:
    """Set or clear the ARROW_FLAG_MAP_KEYS_SORTED bit of the schema flags.

    A truthy ``map_keys_sorted`` turns the bit on; a falsy value turns
    it off. All other flag bits are left untouched. Returns ``self`` so
    that calls can be chained.
    """
    if map_keys_sorted:
        self._ptr.flags |= ARROW_FLAG_MAP_KEYS_SORTED
    else:
        self._ptr.flags &= ~ARROW_FLAG_MAP_KEYS_SORTED

    return self

def validate(self) -> CSchemaView:
    """Return a CSchemaView wrapping the schema being built.

    NOTE(review): constructing the view is presumably what performs the
    validation (i.e., it raises for an invalid schema) -- confirm against
    the CSchemaView constructor.
    """
    schema_view = CSchemaView(self.c_schema)
    return schema_view

Expand Down
98 changes: 87 additions & 11 deletions python/src/nanoarrow/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,12 +401,14 @@ def dictionary_ordered(self) -> Union[bool, None]:
return self._c_schema_view.dictionary_ordered

@property
def value_type(self):
"""Dictionary or list value type
def value_type(self) -> Union["Schema", None]:
"""Dictionary, map, or list value type
>>> import nanoarrow as na
>>> na.list_(na.int32()).value_type
<Schema> 'item': int32
>>> na.map_(na.int32(), na.string()).value_type
<Schema> 'value': string
>>> na.dictionary(na.int32(), na.string()).value_type
<Schema> string
"""
Expand All @@ -416,11 +418,33 @@ def value_type(self):
_types.FIXED_SIZE_LIST,
):
return self.field(0)
elif self._c_schema_view.type_id == _types.MAP:
return Schema(self._c_schema.child(0).child(1))
elif self._c_schema_view.type_id == _types.DICTIONARY:
return Schema(self._c_schema.dictionary)
else:
return None

@property
def key_type(self) -> Union["Schema", None]:
    """Map key type

    The key field of a map schema (the first child of its single
    entries child), or ``None`` if this schema is not a map.

    >>> import nanoarrow as na
    >>> na.map_(na.int32(), na.string()).key_type
    <Schema> 'key': non-nullable int32
    """
    if self._c_schema_view.type_id != _types.MAP:
        return None

    entries = self._c_schema.child(0)
    return Schema(entries.child(0))

@property
def keys_sorted(self) -> Union[bool, None]:
    """Map keys sorted flag

    Whether the keys-sorted flag is set on a map schema, or ``None``
    if this schema is not a map.

    >>> import nanoarrow as na
    >>> na.map_(na.int32(), na.string(), keys_sorted=True).keys_sorted
    True
    """
    if self._c_schema_view.type_id != _types.MAP:
        return None

    return self._c_schema_view.map_keys_sorted

@property
def list_size(self) -> Union[int, None]:
"""Fixed-size list element size
Expand Down Expand Up @@ -979,7 +1003,7 @@ def timestamp(
return Schema(Type.TIMESTAMP, timezone=timezone, unit=unit, nullable=nullable)


def duration(unit, nullable: bool = True):
def duration(unit, nullable: bool = True) -> Schema:
"""Create an instance of a duration type.
Parameters
Expand All @@ -999,7 +1023,7 @@ def duration(unit, nullable: bool = True):
return Schema(Type.DURATION, unit=unit, nullable=nullable)


def interval_months(nullable: bool = True):
def interval_months(nullable: bool = True) -> Schema:
"""Create an instance of an interval type measured in months.
Parameters
Expand All @@ -1017,7 +1041,7 @@ def interval_months(nullable: bool = True):
return Schema(Type.INTERVAL_MONTHS, nullable=nullable)


def interval_day_time(nullable: bool = True):
def interval_day_time(nullable: bool = True) -> Schema:
"""Create an instance of an interval type measured as a day/time pair.
Parameters
Expand All @@ -1035,7 +1059,7 @@ def interval_day_time(nullable: bool = True):
return Schema(Type.INTERVAL_DAY_TIME, nullable=nullable)


def interval_month_day_nano(nullable: bool = True):
def interval_month_day_nano(nullable: bool = True) -> Schema:
"""Create an instance of an interval type measured as a month/day/nanosecond
tuple.
Expand Down Expand Up @@ -1100,7 +1124,7 @@ def decimal256(precision: int, scale: int, nullable: bool = True) -> Schema:
return Schema(Type.DECIMAL256, precision=precision, scale=scale, nullable=nullable)


def struct(fields, nullable=True) -> Schema:
def struct(fields, nullable: bool = True) -> Schema:
"""Create a type representing a named sequence of fields.
Parameters
Expand All @@ -1124,7 +1148,7 @@ def struct(fields, nullable=True) -> Schema:
return Schema(Type.STRUCT, fields=fields, nullable=nullable)


def list_(value_type, nullable=True) -> Schema:
def list_(value_type, nullable: bool = True) -> Schema:
"""Create a type representing a variable-size list of some other type.
Parameters
Expand All @@ -1144,7 +1168,7 @@ def list_(value_type, nullable=True) -> Schema:
return Schema(Type.LIST, value_type=value_type, nullable=nullable)


def large_list(value_type, nullable=True) -> Schema:
def large_list(value_type, nullable: bool = True) -> Schema:
"""Create a type representing a variable-size list of some other type.
Unlike :func:`list_`, the :func:`large_list` can accommodate arrays
Expand All @@ -1167,7 +1191,7 @@ def large_list(value_type, nullable=True) -> Schema:
return Schema(Type.LARGE_LIST, value_type=value_type, nullable=nullable)


def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
def fixed_size_list(value_type, list_size: int, nullable: bool = True) -> Schema:
"""Create a type representing a fixed-size list of some other type.
Parameters
Expand All @@ -1194,7 +1218,40 @@ def fixed_size_list(value_type, list_size, nullable=True) -> Schema:
)


def dictionary(index_type, value_type, dictionary_ordered=False):
def map_(
    key_type, value_type, keys_sorted: bool = False, nullable: bool = True
) -> Schema:
    """Create a type representing a list of key/value mappings

    Note that each element in the list contains potentially many
    key/value pairs (and that a map array contains potentially
    many individual mappings).

    Parameters
    ----------
    key_type : schema-like
        The type of keys in each map element.
    value_type : schema-like
        The type of values in each map element.
    keys_sorted : bool, optional
        True if keys within each map element are sorted.
    nullable : bool, optional
        Use ``False`` to mark this field as non-nullable.

    Examples
    --------

    >>> import nanoarrow as na
    >>> na.map_(na.int32(), na.string())
    <Schema> map<entries: struct<key: int32, value: string>>
    """
    return Schema(
        Type.MAP,
        key_type=key_type,
        value_type=value_type,
        keys_sorted=keys_sorted,
        nullable=nullable,
    )


def dictionary(index_type, value_type, dictionary_ordered: bool = False) -> Schema:
"""Create a type representing dictionary-encoded values
Parameters
Expand Down Expand Up @@ -1290,6 +1347,25 @@ def _c_schema_from_type_and_params(type: Type, params: dict):
factory.allocate_children(1)
factory.set_child(0, "item", c_schema(params.pop("value_type")))

elif type == Type.MAP:
key_schema = c_schema(params.pop("key_type"))
value_schema = c_schema(params.pop("value_type"))

entries = CSchemaBuilder.allocate()
entries.set_format("+s")
entries.set_nullable(False)
entries.allocate_children(2)
entries.set_child(0, "key", key_schema.modify(nullable=False))
entries.set_child(1, "value", value_schema)

factory.set_format("+m")
factory.allocate_children(1)
factory.set_child(0, "entries", entries.finish())
factory.set_nullable(False)

if "keys_sorted" in params:
factory.set_map_keys_sorted(params.pop("keys_sorted"))

elif type == Type.DICTIONARY:
index_type = c_schema(params.pop("index_type"))
factory.set_format(index_type.format)
Expand Down
10 changes: 10 additions & 0 deletions python/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,16 @@ def test_schema_fixed_size_list():
assert schema_obj.list_size == 123


def test_schema_map():
    """map_() exposes its type, key/value types, and the keys_sorted flag."""
    unsorted_map = na.map_(na.int32(), na.string())
    assert unsorted_map.type == na.Type.MAP
    assert unsorted_map.key_type.type == na.Type.INT32
    assert unsorted_map.value_type.type == na.Type.STRING
    assert unsorted_map.keys_sorted is False

    sorted_map = na.map_(na.int32(), na.string(), keys_sorted=True)
    assert sorted_map.keys_sorted is True


def test_schema_dictionary():
schema_obj = na.dictionary(na.int8(), na.null())
assert schema_obj.type == na.Type.DICTIONARY
Expand Down

0 comments on commit 253b7ec

Please sign in to comment.