Skip to content

Commit 0fe4677

Browse files
committed
Finalize CTable varlen scalar support
- Add final validation for vlstring/vlbytes schema options - Tag BatchArray backends with ctable_varlen_scalar metadata - Validate varlen scalar backend metadata on reopen - Fix CTable constructor reopen path for vlstring/vlbytes columns - Add explicit lazy/query guard errors for varlen scalar columns - Extend vlstring/vlbytes persistence and guard tests - Confirm full pytest suite passes
1 parent efe3cc2 commit 0fe4677

5 files changed

Lines changed: 154 additions & 9 deletions

File tree

src/blosc2/ctable.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -936,9 +936,19 @@ def size(self) -> int:
936936
"""Number of live values in the column."""
937937
return len(self)
938938

939+
def _ensure_queryable(self) -> None:
940+
if self.is_varlen_scalar:
941+
raise NotImplementedError(
942+
f"Column {self._col_name!r} is a vlstring/vlbytes column; "
943+
"lazy expressions and vectorized comparisons are not supported yet."
944+
)
945+
939946
@staticmethod
940947
def _unwrap_operand(other):
941-
return other._raw_col if isinstance(other, Column) else other
948+
if isinstance(other, Column):
949+
other._ensure_queryable()
950+
return other._raw_col
951+
return other
942952

943953
@property
944954
def _is_nullable_bool(self) -> bool:
@@ -950,99 +960,129 @@ def _is_nullable_bool(self) -> bool:
950960
)
951961

952962
def __neg__(self):
963+
self._ensure_queryable()
953964
return -self._raw_col
954965

955966
def __pos__(self):
967+
self._ensure_queryable()
956968
return +self._raw_col
957969

958970
def __abs__(self):
971+
self._ensure_queryable()
959972
return abs(self._raw_col)
960973

961974
def __add__(self, other):
975+
self._ensure_queryable()
962976
return self._raw_col + self._unwrap_operand(other)
963977

964978
def __radd__(self, other):
979+
self._ensure_queryable()
965980
return self._unwrap_operand(other) + self._raw_col
966981

967982
def __sub__(self, other):
983+
self._ensure_queryable()
968984
return self._raw_col - self._unwrap_operand(other)
969985

970986
def __rsub__(self, other):
987+
self._ensure_queryable()
971988
return self._unwrap_operand(other) - self._raw_col
972989

973990
def __mul__(self, other):
991+
self._ensure_queryable()
974992
return self._raw_col * self._unwrap_operand(other)
975993

976994
def __rmul__(self, other):
995+
self._ensure_queryable()
977996
return self._unwrap_operand(other) * self._raw_col
978997

979998
def __truediv__(self, other):
999+
self._ensure_queryable()
9801000
return self._raw_col / self._unwrap_operand(other)
9811001

9821002
def __rtruediv__(self, other):
1003+
self._ensure_queryable()
9831004
return self._unwrap_operand(other) / self._raw_col
9841005

9851006
def __floordiv__(self, other):
1007+
self._ensure_queryable()
9861008
return self._raw_col // self._unwrap_operand(other)
9871009

9881010
def __rfloordiv__(self, other):
1011+
self._ensure_queryable()
9891012
return self._unwrap_operand(other) // self._raw_col
9901013

9911014
def __mod__(self, other):
1015+
self._ensure_queryable()
9921016
return self._raw_col % self._unwrap_operand(other)
9931017

9941018
def __rmod__(self, other):
1019+
self._ensure_queryable()
9951020
return self._unwrap_operand(other) % self._raw_col
9961021

9971022
def __pow__(self, other):
1023+
self._ensure_queryable()
9981024
return self._raw_col ** self._unwrap_operand(other)
9991025

10001026
def __rpow__(self, other):
1027+
self._ensure_queryable()
10011028
return self._unwrap_operand(other) ** self._raw_col
10021029

10031030
def __and__(self, other):
1031+
self._ensure_queryable()
10041032
return self._raw_col & self._unwrap_operand(other)
10051033

10061034
def __rand__(self, other):
1035+
self._ensure_queryable()
10071036
return self._unwrap_operand(other) & self._raw_col
10081037

10091038
def __or__(self, other):
1039+
self._ensure_queryable()
10101040
return self._raw_col | self._unwrap_operand(other)
10111041

10121042
def __ror__(self, other):
1043+
self._ensure_queryable()
10131044
return self._unwrap_operand(other) | self._raw_col
10141045

10151046
def __xor__(self, other):
1047+
self._ensure_queryable()
10161048
return self._raw_col ^ self._unwrap_operand(other)
10171049

10181050
def __rxor__(self, other):
1051+
self._ensure_queryable()
10191052
return self._unwrap_operand(other) ^ self._raw_col
10201053

10211054
def __invert__(self):
1055+
self._ensure_queryable()
10221056
if self._is_nullable_bool:
10231057
return self._raw_col == 0
10241058
return ~self._raw_col
10251059

10261060
def __lt__(self, other):
1061+
self._ensure_queryable()
10271062
return self._raw_col < self._unwrap_operand(other)
10281063

10291064
def __le__(self, other):
1065+
self._ensure_queryable()
10301066
return self._raw_col <= self._unwrap_operand(other)
10311067

10321068
def __eq__(self, other):
1069+
self._ensure_queryable()
10331070
if self._is_nullable_bool and isinstance(other, (bool, np.bool_)):
10341071
return self._raw_col == int(other)
10351072
return self._raw_col == self._unwrap_operand(other)
10361073

10371074
def __ne__(self, other):
1075+
self._ensure_queryable()
10381076
if self._is_nullable_bool and isinstance(other, (bool, np.bool_)):
10391077
return self._raw_col == int(not other)
10401078
return self._raw_col != self._unwrap_operand(other)
10411079

10421080
def __gt__(self, other):
1081+
self._ensure_queryable()
10431082
return self._raw_col > self._unwrap_operand(other)
10441083

10451084
def __ge__(self, other):
1085+
self._ensure_queryable()
10461086
return self._raw_col >= self._unwrap_operand(other)
10471087

10481088
@property
@@ -1598,6 +1638,8 @@ def __init__(
15981638
cc = self._schema.columns_by_name[name]
15991639
if self._is_list_column(cc):
16001640
col = storage.open_list_column(name)
1641+
elif self._is_varlen_scalar_column(cc):
1642+
col = storage.open_varlen_scalar_column(name, cc.spec)
16011643
else:
16021644
col = storage.open_column(name)
16031645
self._cols[name] = col
@@ -5656,6 +5698,16 @@ def _where_expression_operands(self) -> dict[str, blosc2.NDArray | blosc2.LazyEx
56565698
operands.update({name: cc["lazy"] for name, cc in self._computed_cols.items()})
56575699
return operands
56585700

5701+
def _guard_varlen_scalar_expression(self, expr: str) -> None:
5702+
for col in self._schema.columns:
5703+
if self._is_varlen_scalar_column(col) and re.search(
5704+
rf"(?<!\w){re.escape(col.name)}(?!\w)", expr
5705+
):
5706+
raise NotImplementedError(
5707+
f"Column {col.name!r} is a vlstring/vlbytes column; "
5708+
"lazy expressions are not supported yet."
5709+
)
5710+
56595711
def where(
56605712
self,
56615713
expr_result: str | np.ndarray | blosc2.NDArray | blosc2.LazyExpr | Column,
@@ -5735,6 +5787,7 @@ def where(
57355787
t.where(not t.returned)
57365788
"""
57375789
if isinstance(expr_result, str):
5790+
self._guard_varlen_scalar_expression(expr_result)
57385791
expr_result = blosc2.lazyexpr(expr_result, self._where_expression_operands())
57395792
if isinstance(expr_result, np.ndarray) and expr_result.dtype == np.bool_:
57405793
expr_result = blosc2.asarray(expr_result)

src/blosc2/ctable_storage.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
_make_persistent_backend,
3434
_open_persistent_backend,
3535
_ScalarVarLenArray,
36+
_validate_role_metadata,
3637
)
3738
from blosc2.schunk import process_opened_object
3839

@@ -396,7 +397,8 @@ def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray:
396397
)
397398
)
398399
else:
399-
backend = _open_persistent_backend(path, self._mode)
400+
backend = _open_persistent_backend(path, self._mode, spec=spec)
401+
_validate_role_metadata(backend, spec)
400402
return _ScalarVarLenArray(spec, backend)
401403

402404
def create_valid_rows(self, *, shape, chunks, blocks):

src/blosc2/scalar_array.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,35 +33,74 @@
3333
_CTABLE_VARLEN_SCALAR_META_KEY = "ctable_varlen_scalar"
3434

3535

36+
def _role_metadata_for_spec(spec) -> dict[str, Any]:
37+
"""Return the fixed metadata role tag for a CTable varlen scalar backend."""
38+
return {
39+
"version": 1,
40+
"py_type": "str" if spec.python_type is str else "bytes",
41+
"nullable": bool(getattr(spec, "nullable", False)),
42+
"batch_rows": getattr(spec, "batch_rows", 2048),
43+
}
44+
45+
46+
def _storage_with_role_meta(spec, **storage_kwargs: Any):
47+
import blosc2
48+
49+
storage = blosc2.Storage(**storage_kwargs)
50+
fixed_meta = dict(storage.meta or {})
51+
fixed_meta[_CTABLE_VARLEN_SCALAR_META_KEY] = _role_metadata_for_spec(spec)
52+
storage.meta = fixed_meta
53+
return storage
54+
55+
56+
def _validate_role_metadata(backend: BatchArray, spec) -> None:
57+
"""Validate the optional CTable varlen scalar role tag on an opened backend."""
58+
meta = backend.schunk.meta
59+
if _CTABLE_VARLEN_SCALAR_META_KEY not in meta:
60+
# Older local artifacts may only have the BatchArray tag; the CTable schema
61+
# still identifies the logical role, so keep reopen tolerant.
62+
return
63+
role = meta[_CTABLE_VARLEN_SCALAR_META_KEY]
64+
expected_py_type = "str" if spec.python_type is str else "bytes"
65+
if role.get("py_type") != expected_py_type:
66+
raise ValueError(
67+
f"Varlen scalar backend type mismatch: expected {expected_py_type!r}, "
68+
f"found {role.get('py_type')!r}."
69+
)
70+
71+
3672
def _make_backend(spec) -> BatchArray:
3773
"""Create a fresh in-memory BatchArray for a varlen scalar spec."""
74+
storage = _storage_with_role_meta(spec)
3875
return BatchArray(
76+
storage=storage,
3977
items_per_block=getattr(spec, "items_per_block", None),
4078
serializer=getattr(spec, "serializer", "msgpack"),
4179
)
4280

4381

4482
def _make_persistent_backend(spec, urlpath: str, mode: str, *, cparams=None, dparams=None) -> BatchArray:
4583
"""Create or open a persistent BatchArray for a varlen scalar spec."""
46-
kwargs: dict[str, Any] = {
47-
"urlpath": urlpath,
48-
"mode": mode,
49-
"contiguous": True,
50-
}
84+
kwargs: dict[str, Any] = {}
5185
if cparams is not None:
5286
kwargs["cparams"] = cparams
5387
if dparams is not None:
5488
kwargs["dparams"] = dparams
89+
storage = _storage_with_role_meta(spec, urlpath=urlpath, mode=mode, contiguous=True)
5590
return BatchArray(
91+
storage=storage,
5692
items_per_block=getattr(spec, "items_per_block", None),
5793
serializer=getattr(spec, "serializer", "msgpack"),
5894
**kwargs,
5995
)
6096

6197

62-
def _open_persistent_backend(urlpath: str, mode: str) -> BatchArray:
98+
def _open_persistent_backend(urlpath: str, mode: str, spec=None) -> BatchArray:
6399
"""Reopen an existing persistent BatchArray (any mode)."""
64-
return BatchArray(urlpath=urlpath, mode=mode)
100+
backend = BatchArray(urlpath=urlpath, mode=mode)
101+
if spec is not None:
102+
_validate_role_metadata(backend, spec)
103+
return backend
65104

66105

67106
class _ScalarVarLenArray:

src/blosc2/schema.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,12 @@ def __init__(
515515
batch_rows: int | None = 2048,
516516
items_per_block: int | None = None,
517517
):
518+
if serializer != "msgpack":
519+
raise ValueError("vlstring currently only supports serializer='msgpack'")
520+
if batch_rows is not None and batch_rows <= 0:
521+
raise ValueError("batch_rows must be positive or None")
522+
if items_per_block is not None and items_per_block <= 0:
523+
raise ValueError("items_per_block must be positive or None")
518524
self.nullable = nullable
519525
self.serializer = serializer
520526
self.batch_rows = batch_rows
@@ -568,6 +574,12 @@ def __init__(
568574
batch_rows: int | None = 2048,
569575
items_per_block: int | None = None,
570576
):
577+
if serializer != "msgpack":
578+
raise ValueError("vlbytes currently only supports serializer='msgpack'")
579+
if batch_rows is not None and batch_rows <= 0:
580+
raise ValueError("batch_rows must be positive or None")
581+
if items_per_block is not None and items_per_block <= 0:
582+
raise ValueError("items_per_block must be positive or None")
571583
self.nullable = nullable
572584
self.serializer = serializer
573585
self.batch_rows = batch_rows

tests/ctable/test_vlstring_vlbytes.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ def test_vlstring_spec_defaults():
3131
assert spec.python_type is str
3232

3333

34+
def test_vlstring_rejects_unsupported_serializer():
35+
with pytest.raises(ValueError, match="serializer='msgpack'"):
36+
blosc2.vlstring(serializer="arrow")
37+
38+
3439
def test_vlbytes_spec_defaults():
3540
spec = blosc2.vlbytes()
3641
assert spec.nullable is False
@@ -382,6 +387,32 @@ def test_ctable_vlstring_save_load(tmp_path):
382387
ct2.close()
383388

384389

390+
def test_ctable_vlstring_backend_role_metadata(tmp_path):
391+
urlpath = str(tmp_path / "vl_meta.b2d")
392+
ct = blosc2.CTable(VLRow, new_data=ROWS[:2], urlpath=urlpath, mode="w")
393+
ct.close()
394+
395+
backend = blosc2.open(str(tmp_path / "vl_meta.b2d" / "_cols" / "text.b2b"), mode="r")
396+
assert backend.schunk.meta["ctable_varlen_scalar"] == {
397+
"version": 1,
398+
"py_type": "str",
399+
"nullable": False,
400+
"batch_rows": 2048,
401+
}
402+
403+
404+
def test_ctable_constructor_reopens_vlstring_persistent_table(tmp_path):
405+
urlpath = str(tmp_path / "vl_ctor_reopen.b2d")
406+
ct = blosc2.CTable(VLRow, new_data=ROWS[:2], urlpath=urlpath, mode="w")
407+
ct.close()
408+
409+
reopened = blosc2.CTable(VLRow, urlpath=urlpath, mode="a")
410+
assert reopened.text[1] == ROWS[1][1]
411+
reopened.append((42, "ctor append", b"ctor bytes"))
412+
assert reopened.text[2] == "ctor append"
413+
reopened.close()
414+
415+
385416
def test_ctable_vlstring_save_reload_b2z(tmp_path):
386417
urlpath = str(tmp_path / "vl_test.b2z")
387418
ct = blosc2.CTable(VLRow, new_data=ROWS, urlpath=urlpath, mode="w")
@@ -479,6 +510,14 @@ def test_ctable_vlstring_sort_raises():
479510
next(gen)
480511

481512

513+
def test_ctable_vlstring_lazy_expression_raises():
514+
ct = blosc2.CTable(VLRow, new_data=ROWS)
515+
with pytest.raises(NotImplementedError, match="vlstring/vlbytes"):
516+
ct.where('text == "hello world"')
517+
with pytest.raises(NotImplementedError, match="vlstring/vlbytes"):
518+
_ = ct.text == "hello world"
519+
520+
482521
def test_ctable_vlstring_build_index_raises(tmp_path):
483522
urlpath = str(tmp_path / "vl_idx.b2d")
484523
ct = blosc2.CTable(VLRow, new_data=ROWS, urlpath=urlpath, mode="w")

0 commit comments

Comments
 (0)