Skip to content

Commit 53208af

Browse files
authored
Fix GroupMetadata backwards compatibility (#2102)
1 parent 8ccbb1c commit 53208af

File tree

3 files changed

+150
-1
lines changed

3 files changed

+150
-1
lines changed

tiledb/cc/group.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ void put_metadata_numpy(Group &group, const std::string &key, py::array value) {
2626
throw py::type_error("Only 1D Numpy arrays can be stored as metadata");
2727

2828
py::size_t ncells = get_ncells(value.dtype());
29-
if (ncells != 1)
29+
// we can't store multi-cell arrays as metadata
30+
// e.g. an array of strings containing strings of more than one character
31+
if (ncells != 1 && value.size() > 1)
3032
throw py::type_error("Unsupported dtype '" +
3133
std::string(py::str(value.dtype())) +
3234
"' for metadata");

tiledb/group.py

+15
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,16 @@ def __setitem__(self, key: str, value: GroupMetadataValueType):
119119
# If the value is not a 1D ndarray, store its associated shape.
120120
# The value's shape will be stored as separate metadata with the correct prefix.
121121
self.__setitem__(f"{Group._NP_SHAPE_PREFIX}{key}", value.shape)
122+
elif isinstance(value, np.generic):
123+
tiledb_type = DataType.from_numpy(value.dtype).tiledb_type
124+
if tiledb_type in (lt.DataType.BLOB, lt.DataType.CHAR):
125+
put_metadata(key, tiledb_type, len(value), value)
126+
elif tiledb_type == lt.DataType.STRING_UTF8:
127+
put_metadata(
128+
key, lt.DataType.STRING_UTF8, len(value), value.encode("UTF-8")
129+
)
130+
else:
131+
put_metadata(key, tiledb_type, 1, value)
122132
else:
123133
from .metadata import pack_metadata_val
124134

@@ -141,11 +151,16 @@ def __getitem__(self, key: str, include_type=False) -> GroupMetadataValueType:
141151

142152
if self._group._has_metadata(key):
143153
data, tdb_type = self._group._get_metadata(key, False)
154+
dtype = DataType.from_tiledb(tdb_type).np_dtype
155+
# we return scalar signed-int and float values as numpy scalars (tuples are left unchanged)
156+
if dtype.kind in ("i", "f") and not isinstance(data, tuple):
157+
data = np.dtype(dtype).type(data)
144158
elif self._group._has_metadata(f"{Group._NP_DATA_PREFIX}{key}"):
145159
data, tdb_type = self._group._get_metadata(
146160
f"{Group._NP_DATA_PREFIX}{key}", True
147161
)
148162
# reshape numpy array back to original shape, if needed
163+
# (the shape metadata is never present in groups written by TileDB-Py <= 0.32.3)
149164
shape_key = f"{Group._NP_SHAPE_PREFIX}{key}"
150165
if self._group._has_metadata(shape_key):
151166
shape, tdb_type = self._group._get_metadata(shape_key, False)

tiledb/tests/test_group.py

+132
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
import base64
2+
import io
13
import os
24
import pathlib
5+
import tarfile
36

47
import numpy as np
58
import pytest
@@ -762,3 +765,132 @@ def test_bytes_metadata(self, capfd):
762765
grp.meta.dump()
763766
assert_captured(capfd, "Type: DataType.BLOB")
764767
grp.close()
768+
769+
def test_group_metadata_backwards_compat(self):
770+
# This test ensures that metadata written with TileDB-Py 0.32.3
771+
# will be read correctly by future versions.
772+
773+
# === The following code creates a group with metadata using the current version of TileDB-Py ===
774+
path_new = self.path("new_group")
775+
tiledb.Group.create(path_new)
776+
group = tiledb.Group(path_new, "w")
777+
778+
# python primitive types
779+
group.meta["python_int"] = -1234
780+
group.meta["python_float"] = 3.14
781+
group.meta["python_str"] = "hello"
782+
group.meta["python_bytes"] = b"hello"
783+
group.meta["python_bool"] = False
784+
785+
# numpy primitive types
786+
group.meta["numpy_int"] = np.int64(-93)
787+
group.meta["numpy_uint"] = np.uint64(42)
788+
group.meta["numpy_float64"] = np.float64(3.14)
789+
group.meta["numpy_bytes"] = np.bytes_("hello")
790+
group.meta["numpy_str"] = np.str_("hello")
791+
group.meta["numpy_bool"] = np.bool(False)
792+
793+
# lists/tuples
794+
group.meta["list_int"] = [7]
795+
group.meta["tuple_int"] = (7,)
796+
group.meta["list_ints"] = [1, -2, 3]
797+
group.meta["tuple_ints"] = (1, 2, 3)
798+
group.meta["list_float"] = [1.1]
799+
group.meta["tuple_float"] = (1.1,)
800+
group.meta["list_floats"] = [1.1, 2.2, 3.3]
801+
group.meta["tuple_floats"] = (1.1, 2.2, 3.3)
802+
group.meta["list_empty"] = []
803+
group.meta["tuple_empty"] = ()
804+
805+
# numpy arrays
806+
group.meta["numpy_int"] = np.array([-11], dtype=np.int64)
807+
group.meta["numpy_ints"] = np.array([1, -2, 3], dtype=np.int64)
808+
group.meta["numpy_uint"] = np.array([22], dtype=np.uint64)
809+
group.meta["numpy_uints"] = np.array([1, 2, 3], dtype=np.uint64)
810+
group.meta["numpy_float"] = np.array([3.14], dtype=np.float64)
811+
group.meta["numpy_floats"] = np.array([1.1, 2.2, 3.3], dtype=np.float64)
812+
group.meta["numpy_byte"] = np.array([b"hello"], dtype="S5")
813+
group.meta["numpy_str"] = np.array(["hello"], dtype="U5")
814+
group.meta["numpy_bool"] = np.array([True, False, True])
815+
816+
group.close()
817+
# === End of the code that creates the group with metadata ===
818+
819+
# The following commented out code was used to generate the base64 encoded string of the group
820+
# with TileDB-Py 0.32.3, after creating the group with metadata in exactly the same way as above.
821+
'''
822+
# Compress the contents of the group folder to tgz
823+
with tarfile.open("test.tar.gz", "w:gz") as tar:
824+
with os.scandir(path_new) as entries:
825+
for entry in entries:
826+
tar.add(entry.path, arcname=entry.name)
827+
828+
# Read the .tgz file and encode it to base64
829+
with open("test.tar.gz", 'rb') as f:
830+
s = base64.encodebytes(f.read())
831+
832+
# Print the base64 encoded string
833+
group_tgz = f"""{s.decode():>32}"""
834+
print(group_tgz)
835+
'''
836+
837+
# The following base64 encoded string is the contents of the group folder compressed
838+
# to a tgz file using TileDB-Py 0.32.3.
839+
group_tgz = b"""H4sICO/+G2cC/3Rlc3QudGFyANPT19N3CEis8EhNTEktYqAJMIAAXLSBgbEJgg0SNzQwMjRiUKhg
840+
oAMoLS5JLAJazzAygZGFQm5JZm6qraG5kaWFhbmlhbGekaGphbGlJRfDKBj2ID4+N7UkUZ+mdoAy
841+
tbmpKYQ2g9AGRqh53tDE3MDM3Nzc2NQcmP8NDc3NGRRM6Zn/E9Mzi/GpAypLSxt+8a83KMp/Y8zy
842+
33C0/KdL+W+Otfy3NBot/kdS+R8fj4h/YPSj8UxTktOSjQxMjNPMzS0MDCxTjVLNTUwS01IMzMxM
843+
zJMTicj/ZiYmuMp/QwNjM9Ty38jQAFhdKBjQM/+P0PJfDIhfMULYV1khNAsjTFYITDIygAQYQbKM
844+
YBYDQv0xIEcAymdEEqtgbA1x9DtsIBATrJgRpRfwgC18R8GqqqXxD1gDJwZtnTTb5YbtE0YbprhD
845+
8y0KH7SwVJTnps9d9sorMOX8Met7M8+yMHzas+bz0rgbMet7z3b75kqb3mSdtisqonQnu8GrGvHI
846+
6WGxX/Jm+7UW7V45+8/OVSZ3+O+Ic/0Sloo+8OKG6hqutaun9NgfXjqDz9ftBZNBwLvXt6+fX94/
847+
++EfK0X1S2nBpVv5jQ0cut7nS8T3/wn7rOpq5q9/Jn2XW8OhQ/frZTLrkycxHt1evlKvrtbsXeIX
848+
2dw33D0fd0yt5vqe8T/k3d3wtO4UI5Vm8yMvspXTJE+ozFY+13ZA7e+avDertDwP+b1mcjq0JPar
849+
QLS26mvFLQH6D97dDbyZlx1b8X/ZHYmHWpqMjTP6QiVvrZX/3nsqxv3WwofHjtgmbk+YGnhC/U1D
850+
v5+z0SvXZ5YfmXhYiw4Ynmi727rZteXvpZULJ/jvNikQV1/tuiM73XDytc2ZVu6PRcy4NN3Cuze9
851+
0GJc1KHr+mXOAxexJaUFAv/kVgi/K+FaI+2wZfqOxoYWocQPGzNeG9h9edh+3DfBJMYzOKL2l+em
852+
ezc0Hyq98xaQ8eT40PDoxpYX60KKnogs7Ht2d+cf9lm5m9pGy8fhDvRG+/+j/X+M9p+JqYGJ+WgD
853+
cES0/0oyc1JTkuLTi/JLC/RKUpJok//xtP+w9P+NTUD9v9H232j5P1r+D0j5b2ZoYDZa/o+I8h9c
854+
8NN0AJiM8V8TA9PR8d9RMApGwSgYBaNgFIyCUTAKRsEooCYAAP1+F2wAKAAA"""
855+
856+
# Create a new group by extracting the contents of the tgz file
857+
path_original = self.path("original_group")
858+
with tarfile.open(fileobj=io.BytesIO(base64.b64decode(group_tgz))) as tf:
859+
try:
860+
tf.extractall(path_original, filter="fully_trusted")
861+
except TypeError:
862+
tf.extractall(path_original)
863+
864+
# Open both the original and the new group and compare the metadata both in values and types
865+
group_original = tiledb.Group(path_original, "r")
866+
group_new = tiledb.Group(path_new, "r")
867+
868+
self.assert_metadata_roundtrip(group_new.meta, group_original.meta)
869+
870+
group_original.close()
871+
group_new.close()
872+
873+
def test_group_metadata_new_types(self):
874+
# This kind of data was not supported for TileDB-Py <= 0.32.3
875+
path_new = self.path("new_group")
876+
877+
tiledb.Group.create(path_new)
878+
group = tiledb.Group(path_new, "w")
879+
test_vals = {
880+
"int64": np.array(-1111, dtype=np.int64),
881+
"uint64": np.array(2, dtype=np.uint64),
882+
"float64": np.array(3.14, dtype=np.float64),
883+
"bool": np.array(True, dtype=bool),
884+
"str": np.array(["a", "b", "c"], dtype="S"),
885+
"unicode": np.array(["a", "b", "c"], dtype="U"),
886+
"bytes": np.array([b"a", b"b", b"c"]),
887+
"datetime": np.array(
888+
[np.datetime64("2021-01-01"), np.datetime64("2021-01-02")]
889+
),
890+
}
891+
group.meta.update(test_vals)
892+
group.close()
893+
894+
group = tiledb.Group(path_new, "r")
895+
self.assert_metadata_roundtrip(group.meta, test_vals)
896+
group.close()

0 commit comments

Comments
 (0)