From 9a7b20fc417294b2752373ccfba580b6dea1143d Mon Sep 17 00:00:00 2001 From: Sasun Hambardzumyan <151129343+khustup2@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:43:03 +0400 Subject: [PATCH] v4.1.3 Release (#3011) --- python/deeplake/__init__.py | 2 +- python/deeplake/__init__.pyi | 180 ++++++++++++++++++++--------------- 2 files changed, 102 insertions(+), 80 deletions(-) diff --git a/python/deeplake/__init__.py b/python/deeplake/__init__.py index 538b73c46e..4e1ff03e77 100644 --- a/python/deeplake/__init__.py +++ b/python/deeplake/__init__.py @@ -14,7 +14,7 @@ def progress_bar(iterable, *args, **kwargs): import deeplake from ._deeplake import * -__version__ = "4.1.2" +__version__ = "4.1.3" __all__ = [ "__version__", diff --git a/python/deeplake/__init__.pyi b/python/deeplake/__init__.pyi index 7ee819eb5b..379449d24a 100644 --- a/python/deeplake/__init__.pyi +++ b/python/deeplake/__init__.pyi @@ -167,7 +167,7 @@ class Future: ```python async def load_data(): ds = await deeplake.open_async("s3://ml-data/images") - batch = await ds.images.get_async(slice(0, 32)) + batch = await ds["images"].get_async(slice(0, 32)) return batch ``` """ @@ -461,6 +461,7 @@ class Metadata(ReadOnlyMetadata): Writable access to dataset and column metadata for ML workflows. Stores important information about datasets like: + - Model parameters and hyperparameters - Preprocessing statistics - Data splits and fold definitions @@ -468,19 +469,29 @@ class Metadata(ReadOnlyMetadata): Changes are persisted immediately without requiring `commit()`. + + Examples: Storing model metadata: - - dataset.metadata["model_name"] = "resnet50" - dataset.metadata["hyperparameters"] = { - "learning_rate": 0.001, - "batch_size": 32 - } + ```python + ds.metadata["model_name"] = "resnet50" + ds.metadata["hyperparameters"] = { + "learning_rate": 0.001, + "batch_size": 32 + } + ``` Setting preprocessing stats: - - dataset.images.metadata["mean"] = [0.485, 0.456, 0.406] - dataset.images.metadata["std"] = [0.229, 0.224, 0.225] + ```python + ds["images"].metadata["mean"] = [0.485, 0.456, 0.406] + ds["images"].metadata["std"] = [0.229, 0.224, 0.225] + ``` """ def __setitem__(self, key: str, value: typing.Any) -> None: @@ -505,27 +516,32 @@ def query(query: str, token: str | None = None) -> DatasetView: Executes TQL queries optimized for ML data filtering and search. TQL is a SQL-like query language designed for ML datasets, supporting: - - Vector similarity search - - Text semantic search - - Complex data filtering - - Joining across datasets - - Efficient sorting and pagination + + - Vector similarity search + - Text semantic search + - Complex data filtering + - Joining across datasets + - Efficient sorting and pagination Args: query: TQL query string supporting: - - Vector similarity: COSINE_SIMILARITY, EUCLIDEAN_DISTANCE - - Text search: BM25_SIMILARITY, CONTAINS - - Filtering: WHERE clauses - - Sorting: ORDER BY - - Joins: JOIN across datasets + + - Vector similarity: COSINE_SIMILARITY, L2_NORM + - Text search: BM25_SIMILARITY, CONTAINS + - MAXSIM similarity for ColPali embeddings: MAXSIM + - Filtering: WHERE clauses + - Sorting: ORDER BY + - Joins: JOIN across datasets + token: Optional Activeloop authentication token Returns: DatasetView: Query results that can be: - - Used directly in ML training - - Further filtered with additional queries - - Converted to PyTorch/TensorFlow dataloaders - - Materialized into a new dataset + + - Used directly in ML training + - Further filtered with additional queries + - Converted to PyTorch/TensorFlow dataloaders + - Materialized into a new dataset