Skip to content

Commit c95de36

Browse files
committed
Move fragment list consolidation API to pybind
1 parent eea0ffe commit c95de36

File tree

5 files changed

+142
-229
lines changed

5 files changed

+142
-229
lines changed

tiledb/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
from .highlevel import (
8383
array_exists,
8484
array_fragments,
85+
consolidate,
8586
empty_like,
8687
from_numpy,
8788
open,
@@ -90,7 +91,6 @@
9091
)
9192
from .libtiledb import (
9293
Array,
93-
consolidate,
9494
ls,
9595
move,
9696
object_type,

tiledb/cc/array.cc

Lines changed: 75 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,49 @@ namespace libtiledbcpp {
1313
using namespace tiledb;
1414
namespace py = pybind11;
1515

16+
void _consolidate_timestamp(const std::string &uri, const std::optional<std::string> &key, tiledb::Config *config, const Context &ctx,
17+
const std::optional<std::tuple<uint64_t, uint64_t>> &timestamp = std::nullopt) {
18+
if (timestamp.has_value()) {
19+
if (std::get<0>(*timestamp) == 0 || std::get<1>(*timestamp) == 0) {
20+
throw std::invalid_argument("'timestamp' argument expects tuple(start: int, end: int) with non-zero values");
21+
}
22+
23+
config->set("sm.consolidation.timestamp_start", std::to_string(std::get<0>(*timestamp)));
24+
config->set("sm.consolidation.timestamp_end", std::to_string(std::get<1>(*timestamp)));
25+
}
26+
27+
tiledb_encryption_type_t key_type = TILEDB_NO_ENCRYPTION;
28+
std::string key_str;
29+
30+
if (key.has_value()) {
31+
key_str = key.value();
32+
if (!key_str.empty()) {
33+
key_type = TILEDB_AES_256_GCM;
34+
}
35+
}
36+
37+
ctx.handle_error(tiledb_array_consolidate_with_key(
38+
ctx.ptr().get(), uri.c_str(), key_type, key_str.c_str(), key_str.size(), config->ptr().get()));
39+
}
40+
41+
void _consolidate_uris(const std::string &uri, const std::optional<std::string> &key, Config *config, const Context &ctx,
42+
const std::vector<std::string> &fragment_uris) {
43+
std::vector<const char *> c_strings;
44+
c_strings.reserve(fragment_uris.size());
45+
for (const auto &str : fragment_uris) {
46+
c_strings.push_back(str.c_str());
47+
}
48+
49+
if (key.has_value() && !key.value().empty()) {
50+
config->set("sm.encryption_key", key.value());
51+
}
52+
53+
ctx.handle_error(tiledb_array_consolidate_fragments(
54+
ctx.ptr().get(), uri.c_str(), c_strings.data(),
55+
fragment_uris.size(), config->ptr().get()));
56+
}
57+
58+
1659
void init_array(py::module &m) {
1760
py::class_<tiledb::Array>(m, "Array")
1861
//.def(py::init<py::object, py::object, py::iterable, py::object,
@@ -55,14 +98,38 @@ void init_array(py::module &m) {
5598
.def("config", &Array::config)
5699
.def("close", &Array::close)
57100
.def("consolidate",
58-
py::overload_cast<const Context &, const std::string &,
59-
Config *const>(&Array::consolidate),
60-
py::call_guard<py::gil_scoped_release>())
61-
.def("consolidate",
62-
py::overload_cast<const Context &, const std::string &,
63-
tiledb_encryption_type_t, const std::string &,
64-
Config *const>(&Array::consolidate),
65-
py::call_guard<py::gil_scoped_release>())
101+
[](Array &self, Config *config, const std::string &key,
102+
const std::vector<std::string> &fragment_uris,
103+
std::tuple<uint64_t, uint64_t> timestamp) {
104+
if (self.query_type() == TILEDB_READ) {
105+
throw TileDBError(
106+
"cannot consolidate array opened in readonly mode (mode='r')");
107+
}
108+
109+
tiledb::Context ctx(*config);
110+
111+
if (fragment_uris.size() > 0) {
112+
if (timestamp != std::make_tuple<uint64_t, uint64_t>(0, 0)) {
113+
PyErr_WarnEx(PyExc_DeprecationWarning,
114+
"The `timestamp` argument is deprecated; pass a list of "
115+
"fragment URIs to consolidate with `fragment_uris`", 1);
116+
}
117+
_consolidate_uris(self.uri(), key, config, ctx, fragment_uris);
118+
} else {
119+
_consolidate_timestamp(self.uri(), key, config, ctx, timestamp);
120+
}
121+
})
122+
.def_static("_consolidate_timestamp", &_consolidate_timestamp)
123+
.def_static("_consolidate_uris", &_consolidate_uris)
124+
// .def("consolidate",
125+
// py::overload_cast<const Context &, const std::string &,
126+
// Config *const>(&Array::consolidate),
127+
// py::call_guard<py::gil_scoped_release>())
128+
// .def("consolidate",
129+
// py::overload_cast<const Context &, const std::string &,
130+
// tiledb_encryption_type_t, const std::string &,
131+
// Config *const>(&Array::consolidate),
132+
// py::call_guard<py::gil_scoped_release>())
66133
//(void (Array::*)(const Context&, const std::string&,
67134
// tiledb_encryption_type_t, const std::string&,
68135
// Config* const)&Array::consolidate)&Array::consolidate)

tiledb/highlevel.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import warnings
2+
13
import numpy as np
24

35
import tiledb
6+
import tiledb.cc as lt
47

58
from .dataframe_ import create_dim
69

@@ -195,6 +198,69 @@ def array_fragments(uri, include_mbrs=False, ctx=None):
195198
return tiledb.FragmentInfoList(uri, include_mbrs, ctx)
196199

197200

201+
def consolidate(
    uri, key=None, config=None, ctx=None, fragment_uris=None, timestamp=None
):
    """Consolidates TileDB array fragments for improved read performance

    :param str uri: URI to the TileDB Array
    :param str key: (default None) Key to decrypt array if the array is encrypted
    :param tiledb.Config config: (default None) The TileDB Config with
        consolidation parameters set. If None, a copy of the context's config
        is used. The timestamp range and encryption key, when given, are
        written into this config by the native helpers.
    :param tiledb.Ctx ctx: (default None) The TileDB Context
    :param fragment_uris: (default None) Consolidate the array using a list of fragment file names
    :param timestamp: (default None) If not None, consolidate the array using the given tuple(int, int) UNIX seconds range (inclusive). This argument will be ignored if `fragment_uris` is passed.
    :rtype: str or bytes
    :return: path (URI) to the consolidated TileDB Array
    :raises TypeError: cannot convert path to unicode string
    :raises: :py:exc:`tiledb.TileDBError`

    Rather than passing the timestamp into this function, it may be set with
    the config parameters `"sm.consolidation.timestamp_start"` and
    `"sm.consolidation.timestamp_end"` which take in a time in UNIX seconds.
    If both the config parameters and this function's `timestamp` argument are
    set, the `timestamp` argument will be used.

    **Example:**

    >>> import tiledb, tempfile, numpy as np, os
    >>> path = tempfile.mkdtemp()

    >>> with tiledb.from_numpy(path, np.zeros(4), timestamp=1) as A:
    ...     pass
    >>> with tiledb.open(path, 'w', timestamp=2) as A:
    ...     A[:] = np.ones(4, dtype=np.int64)
    >>> with tiledb.open(path, 'w', timestamp=3) as A:
    ...     A[:] = np.ones(4, dtype=np.int64)
    >>> with tiledb.open(path, 'w', timestamp=4) as A:
    ...     A[:] = np.ones(4, dtype=np.int64)
    >>> len(tiledb.array_fragments(path))
    4

    >>> fragment_names = [
    ...     os.path.basename(f) for f in tiledb.array_fragments(path).uri
    ... ]
    >>> array_uri = tiledb.consolidate(
    ...     path, fragment_uris=[fragment_names[1], fragment_names[3]]
    ... )
    >>> len(tiledb.array_fragments(path))
    3

    """
    ctx = _get_ctx(ctx)
    if config is None:
        # Work on a copy so the context's own configuration is left untouched.
        config = tiledb.Config(ctx.config())

    if fragment_uris is not None:
        if timestamp is not None:
            warnings.warn(
                "The `timestamp` argument will be ignored and only fragments "
                "passed to `fragment_uris` will be consolidate",
                DeprecationWarning,
                stacklevel=2,  # attribute the warning to the caller's line
            )
        lt.Array._consolidate_uris(uri, key, config, ctx, fragment_uris)
    else:
        lt.Array._consolidate_timestamp(uri, key, config, ctx, timestamp)

    # The native helpers return nothing; hand back the URI as documented.
    return uri
262+
263+
198264
def schema_like(*args, shape=None, dtype=None, ctx=None, **kwargs):
199265
"""
200266
Return an ArraySchema corresponding to a NumPy-like object or

tiledb/libmetadata.pyx

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -385,54 +385,6 @@ cdef class Metadata:
385385

386386
return bool(has_key)
387387

388-
def consolidate(self):
389-
"""
390-
Consolidate array metadata. Array must be closed.
391-
392-
:return:
393-
"""
394-
# TODO: ensure that the array is not x-locked?
395-
ctx = (<Array?> self.array).ctx
396-
config = ctx.config()
397-
cdef:
398-
uint32_t rc = 0
399-
tiledb_ctx_t* ctx_ptr = <tiledb_ctx_t*>PyCapsule_GetPointer(
400-
ctx.__capsule__(), "ctx")
401-
tiledb_config_t* config_ptr = NULL
402-
tiledb_encryption_type_t key_type = TILEDB_NO_ENCRYPTION
403-
void* key_ptr = NULL
404-
uint32_t key_len = 0
405-
bytes bkey
406-
bytes buri = unicode_path(self.array.uri)
407-
str key = (<Array?>self.array).key
408-
409-
if config:
410-
config_ptr = <tiledb_config_t*>PyCapsule_GetPointer(
411-
config.__capsule__(), "config")
412-
413-
if key is not None:
414-
if isinstance(key, str):
415-
bkey = key.encode('ascii')
416-
else:
417-
bkey = bytes(self.array.key)
418-
key_type = TILEDB_AES_256_GCM
419-
key_ptr = <void *> PyBytes_AS_STRING(bkey)
420-
#TODO: unsafe cast here ssize_t -> uint64_t
421-
key_len = <uint32_t> PyBytes_GET_SIZE(bkey)
422-
423-
cdef const char* buri_ptr = <const char*>buri
424-
425-
with nogil:
426-
rc = tiledb_array_consolidate_with_key(
427-
ctx_ptr,
428-
buri_ptr,
429-
key_type,
430-
key_ptr,
431-
key_len,
432-
config_ptr)
433-
if rc != TILEDB_OK:
434-
_raise_ctx_err(ctx_ptr, rc)
435-
436388
get = MutableMapping.get
437389
update = MutableMapping.update
438390

0 commit comments

Comments
 (0)