3636from blosc2 .scalar_array import _ScalarVarLenArray
3737from blosc2 .schema import (
3838 ListSpec ,
39+ ObjectSpec ,
3940 SchemaSpec ,
4041 StructSpec ,
4142 VLBytesSpec ,
@@ -535,7 +536,7 @@ def is_list(self) -> bool:
535536 def is_varlen_scalar (self ) -> bool :
536537 """True if this column holds variable-length scalar strings or bytes."""
537538 col = self ._table ._schema .columns_by_name .get (self ._col_name )
538- return col is not None and isinstance (col .spec , (VLStringSpec , VLBytesSpec , StructSpec ))
539+ return col is not None and isinstance (col .spec , (VLStringSpec , VLBytesSpec , StructSpec , ObjectSpec ))
539540
540541 @property
541542 def _valid_rows (self ):
@@ -1581,7 +1582,7 @@ def _is_list_column(col: CompiledColumn) -> bool:
15811582
15821583 @staticmethod
15831584 def _is_varlen_scalar_column (col : CompiledColumn ) -> bool :
1584- return isinstance (col .spec , (VLStringSpec , VLBytesSpec , StructSpec ))
1585+ return isinstance (col .spec , (VLStringSpec , VLBytesSpec , StructSpec , ObjectSpec ))
15851586
15861587 @staticmethod
15871588 def _is_list_spec (spec : SchemaSpec ) -> bool :
@@ -1644,7 +1645,7 @@ def _resolve_nullable_specs(
16441645 for col in schema .columns :
16451646 spec = col .spec
16461647 if (
1647- isinstance (spec , (ListSpec , VLStringSpec , VLBytesSpec , StructSpec ))
1648+ isinstance (spec , (ListSpec , VLStringSpec , VLBytesSpec , StructSpec , ObjectSpec ))
16481649 or getattr (spec , "null_value" , None ) is not None
16491650 ):
16501651 continue
@@ -2772,6 +2773,10 @@ def _pa_type_from_spec(pa, spec):
27722773 return pa .struct (
27732774 [pa .field (name , CTable ._pa_type_from_spec (pa , child )) for name , child in spec .fields .items ()]
27742775 )
2776+ if isinstance (spec , ObjectSpec ):
2777+ raise TypeError (
2778+ "ObjectSpec columns do not have a fixed Arrow type; materialize values explicitly"
2779+ )
27752780 if spec .to_metadata_dict ().get ("kind" ) == "bool" :
27762781 return pa .bool_ ()
27772782 dtype = getattr (spec , "dtype" , None )
@@ -2858,6 +2863,33 @@ def to_arrow(self):
28582863 def _auto_null_sentinel (pa , pa_type , * , null_policy : NullPolicy ):
28592864 return null_policy .sentinel_for_arrow_type (pa , pa_type )
28602865
2866+ @staticmethod
2867+ def _arrow_type_needs_object_fallback (pa , pa_type ) -> bool :
2868+ """True when *pa_type* has no typed CTable mapping."""
2869+ if pa_type in (
2870+ pa .int8 (),
2871+ pa .int16 (),
2872+ pa .int32 (),
2873+ pa .int64 (),
2874+ pa .uint8 (),
2875+ pa .uint16 (),
2876+ pa .uint32 (),
2877+ pa .uint64 (),
2878+ pa .float32 (),
2879+ pa .float64 (),
2880+ pa .bool_ (),
2881+ pa .string (),
2882+ pa .large_string (),
2883+ pa .utf8 (),
2884+ pa .large_utf8 (),
2885+ ):
2886+ return False
2887+ if pa .types .is_binary (pa_type ) or pa .types .is_large_binary (pa_type ):
2888+ return False
2889+ return not (
2890+ pa .types .is_list (pa_type ) or pa .types .is_large_list (pa_type ) or pa .types .is_struct (pa_type )
2891+ )
2892+
28612893 @staticmethod
28622894 def _arrow_type_to_spec ( # noqa: C901
28632895 pa ,
@@ -2867,6 +2899,7 @@ def _arrow_type_to_spec( # noqa: C901
28672899 string_max_length = None ,
28682900 null_value = None ,
28692901 nullable = False ,
2902+ object_fallback : bool = False ,
28702903 ):
28712904 import blosc2 .schema as b2s
28722905
@@ -2904,7 +2937,11 @@ def _arrow_type_to_spec( # noqa: C901
29042937 if pa_type .value_type in (pa .string (), pa .large_string (), pa .utf8 (), pa .large_utf8 ()):
29052938 item_string_max_length = max (string_max_length or 1 , 1_000_000 )
29062939 item_spec = CTable ._arrow_type_to_spec (
2907- pa , pa_type .value_type , item_arrow_col , string_max_length = item_string_max_length
2940+ pa ,
2941+ pa_type .value_type ,
2942+ item_arrow_col ,
2943+ string_max_length = item_string_max_length ,
2944+ object_fallback = object_fallback ,
29082945 )
29092946 return b2s .list (item_spec , nullable = nullable , storage = "batch" , serializer = "msgpack" )
29102947
@@ -2926,6 +2963,7 @@ def _arrow_type_to_spec( # noqa: C901
29262963 child_col ,
29272964 string_max_length = child_string_max_length ,
29282965 nullable = field .nullable ,
2966+ object_fallback = object_fallback ,
29292967 )
29302968 return b2s .struct (fields , nullable = nullable )
29312969
@@ -2943,9 +2981,14 @@ def _arrow_type_to_spec( # noqa: C901
29432981 max_length = max (string_max_length , len (null_value ) if null_value is not None else 1 , 1 )
29442982 return b2s .bytes (max_length = max_length , null_value = null_value )
29452983
2984+ if object_fallback :
2985+ return b2s .object (nullable = nullable )
2986+
29462987 raise TypeError (
29472988 f"No blosc2 spec for Arrow type { pa_type !r} . Supported: int8/16/32/64, "
2948- "uint8/16/32/64, float32/64, bool, string, binary, list, and struct."
2989+ "uint8/16/32/64, float32/64, bool, string, binary, list, and struct. "
2990+ "Pass object_fallback=True to CTable.from_arrow() to import unsupported Arrow types "
2991+ "as schema-less object columns."
29492992 )
29502993
29512994 @staticmethod
@@ -2963,6 +3006,7 @@ def _compiled_columns_from_arrow(
29633006 string_max_length ,
29643007 * ,
29653008 auto_null_sentinels : bool ,
3009+ object_fallback : bool = False ,
29663010 ):
29673011 null_policy = get_null_policy ()
29683012 column_null_values = null_policy .column_null_values
@@ -2990,9 +3034,13 @@ def _compiled_columns_from_arrow(
29903034 or pa .types .is_large_binary (field .type )
29913035 )
29923036 )
3037+ field_needs_object_fallback = cls ._arrow_type_needs_object_fallback (pa , field .type )
3038+ if field_needs_object_fallback and not object_fallback :
3039+ cls ._arrow_type_to_spec (pa , field .type , arrow_col , object_fallback = False )
3040+ field_is_object_fallback = object_fallback and field_needs_object_fallback
29933041 null_value = None
29943042 has_null_value_override = name in column_null_values
2995- if has_null_value_override and (field_is_list or field_is_struct ):
3043+ if has_null_value_override and (field_is_list or field_is_struct or field_is_object_fallback ):
29963044 raise TypeError (f"column_null_values only supports scalar columns; { name !r} is not scalar" )
29973045 if has_null_value_override and field_is_varlen_scalar :
29983046 raise TypeError (
@@ -3004,13 +3052,17 @@ def _compiled_columns_from_arrow(
30043052 elif (
30053053 auto_null_sentinels
30063054 and field .nullable
3007- and not (field_is_list or field_is_struct or field_is_varlen_scalar )
3055+ and not (
3056+ field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback
3057+ )
30083058 ):
30093059 null_value = cls ._auto_null_sentinel (pa , field .type , null_policy = null_policy )
30103060 if (
30113061 arrow_col is not None
30123062 and arrow_col .null_count
3013- and not (field_is_list or field_is_struct or field_is_varlen_scalar )
3063+ and not (
3064+ field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback
3065+ )
30143066 and null_value is None
30153067 ):
30163068 raise TypeError (
@@ -3024,8 +3076,11 @@ def _compiled_columns_from_arrow(
30243076 string_max_length = column_string_max_length ,
30253077 null_value = null_value ,
30263078 nullable = field .nullable ,
3079+ object_fallback = object_fallback ,
30273080 )
3028- if null_value is not None and not (field_is_list or field_is_struct or field_is_varlen_scalar ):
3081+ if null_value is not None and not (
3082+ field_is_list or field_is_struct or field_is_varlen_scalar or field_is_object_fallback
3083+ ):
30293084 cls ._validate_null_value_for_spec (name , spec , null_value )
30303085 columns .append (cls ._compiled_column_from_spec (name , spec ))
30313086 return columns
@@ -3198,6 +3253,7 @@ def from_arrow(
31983253 auto_null_sentinels : bool = True ,
31993254 blosc2_batch_size : int | None = _BATCH_SIZE_DEFAULT ,
32003255 blosc2_items_per_block : int | None = None ,
3256+ object_fallback : bool = False ,
32013257 ) -> CTable :
32023258 """Build a :class:`CTable` from an Arrow schema and iterable of record batches.
32033259
@@ -3219,9 +3275,13 @@ def from_arrow(
32193275
32203276 ``blosc2_batch_size`` controls how many rows are buffered before
32213277 BatchArray-backed imported columns (list columns and variable-length
3222- scalar columns such as ``vlstring``, ``vlbytes``, and ``struct``) are
3223- flushed to their backend. Set it to ``None`` to keep those columns
3224- pending until the final flush.
3278+ scalar columns such as ``vlstring``, ``vlbytes``, ``struct``, and
3279+ schema-less ``object`` columns) are flushed to their backend. Set it to
3280+ ``None`` to keep those columns pending until the final flush.
3281+
3282+ Unsupported Arrow types raise by default. Pass ``object_fallback=True``
3283+ to import such columns as schema-less :func:`~blosc2.object` columns.
3284+ This fallback is intentionally not used by :meth:`from_parquet`.
32253285 """
32263286 pa = cls ._require_pyarrow ("from_arrow()" )
32273287 if blosc2_batch_size is not None and blosc2_batch_size <= 0 :
@@ -3241,6 +3301,7 @@ def from_arrow(
32413301 table_for_inference ,
32423302 string_max_length ,
32433303 auto_null_sentinels = auto_null_sentinels ,
3304+ object_fallback = object_fallback ,
32443305 )
32453306 for col in columns :
32463307 if (
@@ -3327,7 +3388,9 @@ def from_parquet(
33273388 :meth:`CTable.from_arrow`, so Arrow schema handling, nullable-column support,
33283389 and Blosc2 write tuning follow the same rules as that method. Top-level
33293390 Arrow ``struct<...>`` columns are imported as :func:`~blosc2.struct`
3330- columns backed by batched variable-length storage.
3391+ columns backed by batched variable-length storage. Unsupported Parquet
3392+ types are not silently imported as schema-less :func:`~blosc2.object`
3393+ columns; they raise so callers can decide how to handle them explicitly.
33313394
33323395 Parameters
33333396 ----------
@@ -3913,7 +3976,7 @@ def _fetch_col_at_positions(self, name: str, positions: np.ndarray):
39133976 )
39143977 col = self ._cols [name ]
39153978 spec = self ._schema .columns_by_name [name ].spec
3916- if self ._is_list_spec (spec ) or isinstance (spec , (VLStringSpec , VLBytesSpec , StructSpec )):
3979+ if self ._is_list_spec (spec ) or isinstance (spec , (VLStringSpec , VLBytesSpec , StructSpec , ObjectSpec )):
39173980 return col [positions ]
39183981 return col [positions ]
39193982
@@ -4486,7 +4549,9 @@ def _normalise_sort_keys(
44864549 dtype = self ._col_dtype (name )
44874550 if dtype is None :
44884551 cc = self ._schema .columns_by_name .get (name )
4489- if cc is not None and isinstance (cc .spec , (VLStringSpec , VLBytesSpec , StructSpec )):
4552+ if cc is not None and isinstance (
4553+ cc .spec , (VLStringSpec , VLBytesSpec , StructSpec , ObjectSpec )
4554+ ):
44904555 raise TypeError (
44914556 f"Column { name !r} is a varlen scalar column and does not support sort ordering."
44924557 )
@@ -5377,10 +5442,12 @@ def create_index( # noqa: C901
53775442 col_arr = self ._cols [col_name ]
53785443 if isinstance (self ._schema .columns_by_name [col_name ].spec , ListSpec ):
53795444 raise ValueError (f"Cannot create an index on list column { col_name !r} in V1." )
5380- if isinstance (self ._schema .columns_by_name [col_name ].spec , (VLStringSpec , VLBytesSpec , StructSpec )):
5445+ if isinstance (
5446+ self ._schema .columns_by_name [col_name ].spec , (VLStringSpec , VLBytesSpec , StructSpec , ObjectSpec )
5447+ ):
53815448 raise NotImplementedError (
53825449 f"Cannot create an index on variable-length scalar column { col_name !r} : "
5383- "indexing for vlstring/vlbytes/struct columns is not supported yet."
5450+ "indexing for vlstring/vlbytes/struct/object columns is not supported yet."
53845451 )
53855452 is_persistent = self ._storage .index_anchor_path (col_name ) is not None
53865453
@@ -5771,6 +5838,8 @@ def _dtype_info_label(dtype: np.dtype | None, spec: SchemaSpec | None = None) ->
57715838 return "vlbytes"
57725839 if isinstance (spec , StructSpec ):
57735840 return spec .display_label ()
5841+ if isinstance (spec , ObjectSpec ):
5842+ return spec .display_label ()
57745843 if isinstance (spec , ListSpec ):
57755844 return spec .display_label ()
57765845 if dtype is None :
@@ -6025,7 +6094,7 @@ def _guard_varlen_scalar_expression(self, expr: str) -> None:
60256094 rf"(?<!\w){ re .escape (col .name )} (?!\w)" , expr
60266095 ):
60276096 raise NotImplementedError (
6028- f"Column { col .name !r} is a variable-length scalar column (vlstring/vlbytes/struct); "
6097+ f"Column { col .name !r} is a variable-length scalar column (vlstring/vlbytes/struct/object ); "
60296098 "lazy expressions are not supported yet."
60306099 )
60316100
0 commit comments