Skip to content

Commit e410e5d

Browse files
committed
New blosc2.object() for declaring columns as fully general objects
1 parent e0b1b6b commit e410e5d

8 files changed

Lines changed: 246 additions & 34 deletions

File tree

doc/reference/ctable.rst

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,15 +550,44 @@ Text & binary
550550
vlstring
551551
vlbytes
552552
struct
553+
object
553554
list
554555

555556
.. autoclass:: string
556557
.. autoclass:: bytes
557558
.. autofunction:: vlstring
558559
.. autofunction:: vlbytes
559560
.. autofunction:: struct
561+
.. autofunction:: object
560562
.. autofunction:: list
561563

564+
Object columns
565+
--------------
566+
567+
Schema-less object columns are declared with :func:`blosc2.object` and store one
568+
msgpack-serializable Python object (or ``None`` when nullable) per row in
569+
batched variable-length storage. Prefer typed specs such as :func:`blosc2.struct`
570+
or :func:`blosc2.list` when the payload has a stable schema; use object columns
571+
for heterogeneous per-row payloads::
572+
573+
from dataclasses import dataclass
574+
import blosc2 as b2
575+
576+
@dataclass
577+
class Event:
578+
id: int = b2.field(b2.int64())
579+
payload: object = b2.field(b2.object(nullable=True))
580+
581+
table.append([1, {"kind": "click", "xy": [10, 20]}])
582+
table.append([2, ("custom", {"nested": True})])
583+
table.append([3, None])
584+
585+
Object columns have no fixed Arrow type, so :meth:`CTable.to_arrow` and
586+
:meth:`CTable.to_parquet` raise for them unless users first convert the payloads
587+
to a typed representation. They are not used as an implicit fallback during
588+
Parquet import; unsupported Arrow/Parquet types still raise unless explicitly
589+
imported through :meth:`CTable.from_arrow` with ``object_fallback=True``.
590+
562591
Struct columns
563592
--------------
564593

src/blosc2/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,7 @@ def _raise(exc):
748748
int32,
749749
int64,
750750
list,
751+
object,
751752
string,
752753
struct,
753754
uint8,
@@ -791,6 +792,7 @@ def _raise(exc):
791792
"int32",
792793
"int64",
793794
"list",
795+
"object",
794796
"string",
795797
"struct",
796798
"uint8",

src/blosc2/ctable.py

Lines changed: 87 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from blosc2.scalar_array import _ScalarVarLenArray
3737
from blosc2.schema import (
3838
ListSpec,
39+
ObjectSpec,
3940
SchemaSpec,
4041
StructSpec,
4142
VLBytesSpec,
@@ -535,7 +536,7 @@ def is_list(self) -> bool:
535536
def is_varlen_scalar(self) -> bool:
536537
"""True if this column holds variable-length scalar strings or bytes."""
537538
col = self._table._schema.columns_by_name.get(self._col_name)
538-
return col is not None and isinstance(col.spec, (VLStringSpec, VLBytesSpec, StructSpec))
539+
return col is not None and isinstance(col.spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec))
539540

540541
@property
541542
def _valid_rows(self):
@@ -1581,7 +1582,7 @@ def _is_list_column(col: CompiledColumn) -> bool:
15811582

15821583
@staticmethod
15831584
def _is_varlen_scalar_column(col: CompiledColumn) -> bool:
1584-
return isinstance(col.spec, (VLStringSpec, VLBytesSpec, StructSpec))
1585+
return isinstance(col.spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec))
15851586

15861587
@staticmethod
15871588
def _is_list_spec(spec: SchemaSpec) -> bool:
@@ -1644,7 +1645,7 @@ def _resolve_nullable_specs(
16441645
for col in schema.columns:
16451646
spec = col.spec
16461647
if (
1647-
isinstance(spec, (ListSpec, VLStringSpec, VLBytesSpec, StructSpec))
1648+
isinstance(spec, (ListSpec, VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec))
16481649
or getattr(spec, "null_value", None) is not None
16491650
):
16501651
continue
@@ -2772,6 +2773,10 @@ def _pa_type_from_spec(pa, spec):
27722773
return pa.struct(
27732774
[pa.field(name, CTable._pa_type_from_spec(pa, child)) for name, child in spec.fields.items()]
27742775
)
2776+
if isinstance(spec, ObjectSpec):
2777+
raise TypeError(
2778+
"ObjectSpec columns do not have a fixed Arrow type; materialize values explicitly"
2779+
)
27752780
if spec.to_metadata_dict().get("kind") == "bool":
27762781
return pa.bool_()
27772782
dtype = getattr(spec, "dtype", None)
@@ -2858,6 +2863,33 @@ def to_arrow(self):
28582863
def _auto_null_sentinel(pa, pa_type, *, null_policy: NullPolicy):
28592864
return null_policy.sentinel_for_arrow_type(pa, pa_type)
28602865

2866+
@staticmethod
2867+
def _arrow_type_needs_object_fallback(pa, pa_type) -> bool:
2868+
"""True when *pa_type* has no typed CTable mapping."""
2869+
if pa_type in (
2870+
pa.int8(),
2871+
pa.int16(),
2872+
pa.int32(),
2873+
pa.int64(),
2874+
pa.uint8(),
2875+
pa.uint16(),
2876+
pa.uint32(),
2877+
pa.uint64(),
2878+
pa.float32(),
2879+
pa.float64(),
2880+
pa.bool_(),
2881+
pa.string(),
2882+
pa.large_string(),
2883+
pa.utf8(),
2884+
pa.large_utf8(),
2885+
):
2886+
return False
2887+
if pa.types.is_binary(pa_type) or pa.types.is_large_binary(pa_type):
2888+
return False
2889+
return not (
2890+
pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type) or pa.types.is_struct(pa_type)
2891+
)
2892+
28612893
@staticmethod
28622894
def _arrow_type_to_spec( # noqa: C901
28632895
pa,
@@ -2867,6 +2899,7 @@ def _arrow_type_to_spec( # noqa: C901
28672899
string_max_length=None,
28682900
null_value=None,
28692901
nullable=False,
2902+
object_fallback: bool = False,
28702903
):
28712904
import blosc2.schema as b2s
28722905

@@ -2904,7 +2937,11 @@ def _arrow_type_to_spec( # noqa: C901
29042937
if pa_type.value_type in (pa.string(), pa.large_string(), pa.utf8(), pa.large_utf8()):
29052938
item_string_max_length = max(string_max_length or 1, 1_000_000)
29062939
item_spec = CTable._arrow_type_to_spec(
2907-
pa, pa_type.value_type, item_arrow_col, string_max_length=item_string_max_length
2940+
pa,
2941+
pa_type.value_type,
2942+
item_arrow_col,
2943+
string_max_length=item_string_max_length,
2944+
object_fallback=object_fallback,
29082945
)
29092946
return b2s.list(item_spec, nullable=nullable, storage="batch", serializer="msgpack")
29102947

@@ -2926,6 +2963,7 @@ def _arrow_type_to_spec( # noqa: C901
29262963
child_col,
29272964
string_max_length=child_string_max_length,
29282965
nullable=field.nullable,
2966+
object_fallback=object_fallback,
29292967
)
29302968
return b2s.struct(fields, nullable=nullable)
29312969

@@ -2943,9 +2981,14 @@ def _arrow_type_to_spec( # noqa: C901
29432981
max_length = max(string_max_length, len(null_value) if null_value is not None else 1, 1)
29442982
return b2s.bytes(max_length=max_length, null_value=null_value)
29452983

2984+
if object_fallback:
2985+
return b2s.object(nullable=nullable)
2986+
29462987
raise TypeError(
29472988
f"No blosc2 spec for Arrow type {pa_type!r}. Supported: int8/16/32/64, "
2948-
"uint8/16/32/64, float32/64, bool, string, binary, list, and struct."
2989+
"uint8/16/32/64, float32/64, bool, string, binary, list, and struct. "
2990+
"Pass object_fallback=True to CTable.from_arrow() to import unsupported Arrow types "
2991+
"as schema-less object columns."
29492992
)
29502993

29512994
@staticmethod
@@ -2963,6 +3006,7 @@ def _compiled_columns_from_arrow(
29633006
string_max_length,
29643007
*,
29653008
auto_null_sentinels: bool,
3009+
object_fallback: bool = False,
29663010
):
29673011
null_policy = get_null_policy()
29683012
column_null_values = null_policy.column_null_values
@@ -2990,9 +3034,13 @@ def _compiled_columns_from_arrow(
29903034
or pa.types.is_large_binary(field.type)
29913035
)
29923036
)
3037+
field_needs_object_fallback = cls._arrow_type_needs_object_fallback(pa, field.type)
3038+
if field_needs_object_fallback and not object_fallback:
3039+
cls._arrow_type_to_spec(pa, field.type, arrow_col, object_fallback=False)
3040+
field_is_object_fallback = object_fallback and field_needs_object_fallback
29933041
null_value = None
29943042
has_null_value_override = name in column_null_values
2995-
if has_null_value_override and (field_is_list or field_is_struct):
3043+
if has_null_value_override and (field_is_list or field_is_struct or field_is_object_fallback):
29963044
raise TypeError(f"column_null_values only supports scalar columns; {name!r} is not scalar")
29973045
if has_null_value_override and field_is_varlen_scalar:
29983046
raise TypeError(
@@ -3004,13 +3052,17 @@ def _compiled_columns_from_arrow(
30043052
elif (
30053053
auto_null_sentinels
30063054
and field.nullable
3007-
and not (field_is_list or field_is_struct or field_is_varlen_scalar)
3055+
and not (
3056+
field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback
3057+
)
30083058
):
30093059
null_value = cls._auto_null_sentinel(pa, field.type, null_policy=null_policy)
30103060
if (
30113061
arrow_col is not None
30123062
and arrow_col.null_count
3013-
and not (field_is_list or field_is_struct or field_is_varlen_scalar)
3063+
and not (
3064+
field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback
3065+
)
30143066
and null_value is None
30153067
):
30163068
raise TypeError(
@@ -3024,8 +3076,11 @@ def _compiled_columns_from_arrow(
30243076
string_max_length=column_string_max_length,
30253077
null_value=null_value,
30263078
nullable=field.nullable,
3079+
object_fallback=object_fallback,
30273080
)
3028-
if null_value is not None and not (field_is_list or field_is_struct or field_is_varlen_scalar):
3081+
if null_value is not None and not (
3082+
field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback
3083+
):
30293084
cls._validate_null_value_for_spec(name, spec, null_value)
30303085
columns.append(cls._compiled_column_from_spec(name, spec))
30313086
return columns
@@ -3198,6 +3253,7 @@ def from_arrow(
31983253
auto_null_sentinels: bool = True,
31993254
blosc2_batch_size: int | None = _BATCH_SIZE_DEFAULT,
32003255
blosc2_items_per_block: int | None = None,
3256+
object_fallback: bool = False,
32013257
) -> CTable:
32023258
"""Build a :class:`CTable` from an Arrow schema and iterable of record batches.
32033259
@@ -3219,9 +3275,13 @@ def from_arrow(
32193275
32203276
``blosc2_batch_size`` controls how many rows are buffered before
32213277
BatchArray-backed imported columns (list columns and variable-length
3222-
scalar columns such as ``vlstring``, ``vlbytes``, and ``struct``) are
3223-
flushed to their backend. Set it to ``None`` to keep those columns
3224-
pending until the final flush.
3278+
scalar columns such as ``vlstring``, ``vlbytes``, ``struct``, and
3279+
schema-less ``object`` columns) are flushed to their backend. Set it to
3280+
``None`` to keep those columns pending until the final flush.
3281+
3282+
Unsupported Arrow types raise by default. Pass ``object_fallback=True``
3283+
to import such columns as schema-less :func:`~blosc2.object` columns.
3284+
This fallback is intentionally not used by :meth:`from_parquet`.
32253285
"""
32263286
pa = cls._require_pyarrow("from_arrow()")
32273287
if blosc2_batch_size is not None and blosc2_batch_size <= 0:
@@ -3241,6 +3301,7 @@ def from_arrow(
32413301
table_for_inference,
32423302
string_max_length,
32433303
auto_null_sentinels=auto_null_sentinels,
3304+
object_fallback=object_fallback,
32443305
)
32453306
for col in columns:
32463307
if (
@@ -3327,7 +3388,9 @@ def from_parquet(
33273388
:meth:`CTable.from_arrow`, so Arrow schema handling, nullable-column support,
33283389
and Blosc2 write tuning follow the same rules as that method. Top-level
33293390
Arrow ``struct<...>`` columns are imported as :func:`~blosc2.struct`
3330-
columns backed by batched variable-length storage.
3391+
columns backed by batched variable-length storage. Unsupported Parquet
3392+
types are not silently imported as schema-less :func:`~blosc2.object`
3393+
columns; they raise so callers can decide how to handle them explicitly.
33313394
33323395
Parameters
33333396
----------
@@ -3913,7 +3976,7 @@ def _fetch_col_at_positions(self, name: str, positions: np.ndarray):
39133976
)
39143977
col = self._cols[name]
39153978
spec = self._schema.columns_by_name[name].spec
3916-
if self._is_list_spec(spec) or isinstance(spec, (VLStringSpec, VLBytesSpec, StructSpec)):
3979+
if self._is_list_spec(spec) or isinstance(spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec)):
39173980
return col[positions]
39183981
return col[positions]
39193982

@@ -4486,7 +4549,9 @@ def _normalise_sort_keys(
44864549
dtype = self._col_dtype(name)
44874550
if dtype is None:
44884551
cc = self._schema.columns_by_name.get(name)
4489-
if cc is not None and isinstance(cc.spec, (VLStringSpec, VLBytesSpec, StructSpec)):
4552+
if cc is not None and isinstance(
4553+
cc.spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec)
4554+
):
44904555
raise TypeError(
44914556
f"Column {name!r} is a varlen scalar column and does not support sort ordering."
44924557
)
@@ -5377,10 +5442,12 @@ def create_index( # noqa: C901
53775442
col_arr = self._cols[col_name]
53785443
if isinstance(self._schema.columns_by_name[col_name].spec, ListSpec):
53795444
raise ValueError(f"Cannot create an index on list column {col_name!r} in V1.")
5380-
if isinstance(self._schema.columns_by_name[col_name].spec, (VLStringSpec, VLBytesSpec, StructSpec)):
5445+
if isinstance(
5446+
self._schema.columns_by_name[col_name].spec, (VLStringSpec, VLBytesSpec, StructSpec, ObjectSpec)
5447+
):
53815448
raise NotImplementedError(
53825449
f"Cannot create an index on variable-length scalar column {col_name!r}: "
5383-
"indexing for vlstring/vlbytes/struct columns is not supported yet."
5450+
"indexing for vlstring/vlbytes/struct/object columns is not supported yet."
53845451
)
53855452
is_persistent = self._storage.index_anchor_path(col_name) is not None
53865453

@@ -5771,6 +5838,8 @@ def _dtype_info_label(dtype: np.dtype | None, spec: SchemaSpec | None = None) ->
57715838
return "vlbytes"
57725839
if isinstance(spec, StructSpec):
57735840
return spec.display_label()
5841+
if isinstance(spec, ObjectSpec):
5842+
return spec.display_label()
57745843
if isinstance(spec, ListSpec):
57755844
return spec.display_label()
57765845
if dtype is None:
@@ -6025,7 +6094,7 @@ def _guard_varlen_scalar_expression(self, expr: str) -> None:
60256094
rf"(?<!\w){re.escape(col.name)}(?!\w)", expr
60266095
):
60276096
raise NotImplementedError(
6028-
f"Column {col.name!r} is a variable-length scalar column (vlstring/vlbytes/struct); "
6097+
f"Column {col.name!r} is a variable-length scalar column (vlstring/vlbytes/struct/object); "
60296098
"lazy expressions are not supported yet."
60306099
)
60316100

0 commit comments

Comments
 (0)