From aa16ba0f9a3696036010c35b06c7a2866623ed3c Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 14:41:40 +0530 Subject: [PATCH 1/7] improvements to to_tensorflow, bbox, classlabels --- benchmarks/benchmark_sequential_write.py | 2 +- hub/api/dataset.py | 5 +- hub/api/datasetview.py | 8 ++- hub/api/integrations.py | 17 +++++ hub/api/tensorview.py | 13 +++- hub/api/tests/test_dataset.py | 34 +++------- hub/compute/ray.py | 2 +- hub/compute/transform.py | 4 +- hub/schema/bbox.py | 30 ++++++++- hub/schema/class_label.py | 23 +++++-- hub/schema/deserialize.py | 6 ++ hub/schema/sequence.py | 4 +- hub/schema/tests/test_features.py | 80 +++++++++++++++++++++++- 13 files changed, 180 insertions(+), 48 deletions(-) diff --git a/benchmarks/benchmark_sequential_write.py b/benchmarks/benchmark_sequential_write.py index f5bd3f0c76..6f86999da6 100644 --- a/benchmarks/benchmark_sequential_write.py +++ b/benchmarks/benchmark_sequential_write.py @@ -33,7 +33,7 @@ def time_batches(dataset, batch_size=1, num_batches=1, hub=False): dataset["label"][ batch * batch_size : (batch + 1) * batch_size ] = np.random.randint(10, size=(batch_size, 1)) - dataset.commit() + dataset.flush() counter += 1 t1 = time() print("Batch", counter, f"dt: {t1 - t0}") diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 5b8b719560..2c0f5d4feb 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -584,9 +584,8 @@ def to_pytorch( ds = _to_pytorch(self, transform, inplace, output_type, indexes) return ds - def to_tensorflow(self, indexes=None, include_shapes=False): + def to_tensorflow(self, indexes=None, include_shapes=False, repeat=False): """| Converts the dataset into a tensorflow compatible format - Parameters ---------- indexes: list or int, optional @@ -597,7 +596,7 @@ def to_tensorflow(self, indexes=None, include_shapes=False): """ from .integrations import _to_tensorflow - ds = _to_tensorflow(self, indexes, include_shapes) + ds = _to_tensorflow(self, indexes, include_shapes, repeat=repeat) return ds def _get_dictionary(self, subpath, slice_=None): diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py index 17418d5043..3bec7990e4 100644 --- a/hub/api/datasetview.py +++ b/hub/api/datasetview.py @@ -249,7 +249,7 @@ def __str__(self): def __repr__(self): return self.__str__() - def to_tensorflow(self, include_shapes): + def to_tensorflow(self, include_shapes=False, repeat=False): """|Converts the dataset into a tensorflow compatible format Parameters @@ -260,7 +260,7 @@ def to_tensorflow(self, include_shapes): """ return self.dataset.to_tensorflow( - indexes=self.indexes, include_shapes=include_shapes + indexes=self.indexes, include_shapes=include_shapes, repeat=repeat ) def to_pytorch( @@ -295,6 +295,10 @@ def commit(self) -> None: """Commit dataset""" self.dataset.commit() + def flush(self) -> None: + """Flush dataset""" + self.dataset.flush() + def numpy(self, label_name=False): """Gets the value from different tensorview objects in the datasetview schema diff --git a/hub/api/integrations.py b/hub/api/integrations.py index 38c00a69da..06458c703c 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -185,6 +185,7 @@ def _get_active_item(key, index): return _active_chunks[key][index % samples_per_chunk] def tf_gen(): + key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} for index in indexes: d = {} for key in dataset.keys: @@ -197,6 +198,15 @@ def tf_gen(): cur[split_key[i]] = {} cur = cur[split_key[i]] cur[split_key[-1]] = _get_active_item(key, index) + if 
isinstance(key_dtype_map[key], Text): + value = cur[split_key[-1]] + if value.ndim == 1: + value = "".join(chr(it) for it in value.tolist()) + elif value.ndim == 2: + value = [ + "".join(chr(it) for it in val.tolist()) for val in value + ] + cur[split_key[-1]] = value yield (d) def dict_to_tf(my_dtype): @@ -208,9 +218,14 @@ def dict_to_tf(my_dtype): def tensor_to_tf(my_dtype): return dtype_to_tf(my_dtype.dtype) + def text_to_tf(my_dtype): + return "string" + def dtype_to_tf(my_dtype): if isinstance(my_dtype, SchemaDict): return dict_to_tf(my_dtype) + elif isinstance(my_dtype, Text): + return text_to_tf(my_dtype) elif isinstance(my_dtype, Tensor): return tensor_to_tf(my_dtype) elif isinstance(my_dtype, Primitive): @@ -221,6 +236,8 @@ def dtype_to_tf(my_dtype): def get_output_shapes(my_dtype): if isinstance(my_dtype, SchemaDict): return output_shapes_from_dict(my_dtype) + elif isinstance(my_dtype, Text): + return () elif isinstance(my_dtype, Tensor): return my_dtype.shape elif isinstance(my_dtype, Primitive): diff --git a/hub/api/tensorview.py b/hub/api/tensorview.py index 5769924ccb..0f33a0bda0 100644 --- a/hub/api/tensorview.py +++ b/hub/api/tensorview.py @@ -98,9 +98,18 @@ def numpy(self, label_name=False): if isinstance(self.dtype, hub.schema.class_label.ClassLabel) and label_name: if isinstance(self.indexes, int): - value = self.dtype.int2str(value) + if value.ndim == 0: + value = self.dtype.int2str(value) + elif value.ndim == 1: + value = [self.dtype.int2str(value[i]) for i in range(value.size)] else: - value = [self.dtype.int2str(value[i]) for i in range(value.size)] + if value.ndim == 1: + value = [self.dtype.int2str(value[i]) for i in range(value.size)] + elif value.ndim == 2: + value = [ + [self.dtype.int2str(item[i]) for i in range(item.size)] + for item in value + ] if isinstance(self.dtype, hub.schema.text.Text): if self.dataset.tokenizer is not None: diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index 6cce2bd951..469bb299ad 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -56,10 +56,7 @@ def test_dataset_append_and_read(): shutil.rmtree("./data/test/test_dataset_append_and_read") ds = Dataset( - schema=dt, - shape=(2,), - url="./data/test/test_dataset_append_and_read", - mode="a", + schema=dt, shape=(2,), url="./data/test/test_dataset_append_and_read", mode="a", ) ds["first"][0] = 2.3 @@ -68,10 +65,7 @@ def test_dataset_append_and_read(): assert ds["second"][0].numpy() != 2.3 ds.close() - ds = Dataset( - url="./data/test/test_dataset_append_and_read", - mode="r", - ) + ds = Dataset(url="./data/test/test_dataset_append_and_read", mode="r",) assert ds.meta_information["description"] == "This is my description" ds.meta_information["hello"] = 5 ds.delete() @@ -160,19 +154,10 @@ def test_dataset_with_chunks(): def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"): schema = { "first": Tensor( - shape=(None, None), - dtype="int32", - max_shape=(100, 100), - chunks=(100,), + shape=(None, None), dtype="int32", max_shape=(100, 100), chunks=(100,), ) } - ds = Dataset( - url=url, - token=None, - shape=(1000,), - mode="w", - schema=schema, - ) + ds = Dataset(url=url, token=None, shape=(1000,), mode="w", schema=schema,) ds["first"][0] = np.ones((10, 10)) @@ -194,10 +179,7 @@ def test_pickleability_gcs(): def test_dataset_dynamic_shaped(): schema = { "first": Tensor( - shape=(None, None), - dtype="int32", - max_shape=(100, 100), - chunks=(100,), + shape=(None, None), dtype="int32", max_shape=(100, 100), 
chunks=(100,),
         )
     }
     ds = Dataset(
@@ -554,7 +536,7 @@ def test_append_dataset():
     assert ds["first"].shape[0] == 120
     assert ds["first", 5:10].shape[0] == 5
     assert ds["second"].shape[0] == 120
-    ds.commit()
+    ds.flush()
 
     ds = Dataset(url)
     assert ds["first"].shape[0] == 120
@@ -751,9 +733,7 @@ def my_transform(annotation):
     assert ds["a", 30].compute() == np.array([0.2])
 
     ds2 = Dataset(schema=my_schema, url="./data/casting3", shape=(100,))
-    ds2["a", 0:100] = np.ones(
-        100,
-    )
+    ds2["a", 0:100] = np.ones(100,)
     assert ds2["a", 30].compute() == np.array([1])
 
 
diff --git a/hub/compute/ray.py b/hub/compute/ray.py
index f602551079..817da70a3d 100644
--- a/hub/compute/ray.py
+++ b/hub/compute/ray.py
@@ -225,7 +225,7 @@ def upload(
         results = ray.get(tasks)
         self.set_dynamic_shapes(results, ds)
-        ds.commit()
+        ds.flush()
         return ds
 
     def set_dynamic_shapes(self, results, ds):
diff --git a/hub/compute/transform.py b/hub/compute/transform.py
index 86d1530f92..5f38a392b9 100644
--- a/hub/compute/transform.py
+++ b/hub/compute/transform.py
@@ -288,7 +288,7 @@ def upload_chunk(i_batch):
                     [slice(offset, offset + len(value))], value
                 )
 
-        ds.commit()
+        ds.flush()
         return ds
 
     def call_func(self, fn_index, item, as_list=False):
@@ -439,7 +439,7 @@ def batchify_generator(iterator: Iterable, size: int):
             start += n_results
 
         ds_out.resize_shape(total)
-        ds_out.commit()
+        ds_out.flush()
         return ds_out
 
     @property
diff --git a/hub/schema/bbox.py b/hub/schema/bbox.py
index 2285babb1b..435603e0fa 100644
--- a/hub/schema/bbox.py
+++ b/hub/schema/bbox.py
@@ -16,11 +16,26 @@ class BBox(Tensor):
     normalized coordinates of the bounding box `[xmin, ymin, xmax, ymax]`
     """
 
-    def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
+    def __init__(
+        self,
+        shape: Tuple[int, ...] = (4,),
+        max_shape: Tuple[int, ...] = None,
+        dtype="float64",
+        chunks=None,
+        compressor="lz4",
+    ):
         """Construct the connector.
 
         Parameters
         ----------
+        shape: tuple of ints or None
+            The shape of the bounding boxes for a sample.
+            Will be (4,) if there is a single bounding box per sample.
+            If there are N bounding boxes per sample, the shape should be (N, 4).
+            If the number of bounding boxes per sample varies from 0 to M, the shape should be set to (None, 4) and max_shape should be set to (M, 4).
+            Defaults to (4,).
+        max_shape : Tuple[int], optional
+            Maximum shape of BBox if shape is dynamic
         dtype : str
             dtype of bbox coordinates. Default: 'float64'
         chunks : Tuple[int] | True
@@ -29,8 +44,13 @@ def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
             Sample Count is also in the list of tensor's dimensions (first dimension)
             If default value is chosen, automatically detects how to split into chunks
         """
+        self.check_shape(shape)
         super(BBox, self).__init__(
-            shape=(4,), dtype=dtype, chunks=chunks, compressor=compressor
+            shape=shape,
+            max_shape=max_shape,
+            dtype=dtype,
+            chunks=chunks,
+            compressor=compressor,
         )
 
     def __str__(self):
@@ -40,3 +60,9 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
+    def check_shape(self, shape):
+        if len(shape) not in [1, 2] or shape[-1] != 4:
+            raise ValueError(
+                "Wrong BBox shape provided, should be of the format (4,) or (None, 4) or (N, 4)"
+            )
diff --git a/hub/schema/class_label.py b/hub/schema/class_label.py
index b03cabe921..ce4aa17ccc 100644
--- a/hub/schema/class_label.py
+++ b/hub/schema/class_label.py
@@ -4,7 +4,7 @@
 If a copy of the MPL was not distributed with this file, You
 can obtain one at https://mozilla.org/MPL/2.0/.
""" -from typing import List +from typing import List, Tuple from hub.schema.features import Tensor @@ -18,6 +18,8 @@ class ClassLabel(Tensor): def __init__( self, + shape: Tuple[int, ...] = (), + max_shape: Tuple[int, ...] = None, num_classes: int = None, names: List[str] = None, names_file: str = None, @@ -40,15 +42,21 @@ def __init__( Parameters ---------- + shape: tuple of ints or None + The shape of classlabel. + Will be () if only one classbabel corresponding to each sample. + If N classlabels corresponding to each sample, shape should be (N,) + If the number of classlabels for each sample vary from 0 to M. The shape should be set to (None,) and max_shape should be set to (M,) + Defaults to (). + max_shape : Tuple[int], optional + Maximum shape of ClassLabel num_classes: `int` number of classes. All labels must be < num_classes. names: `list` string names for the integer classes. The order in which the names are provided is kept. names_file: `str` path to a file with names for the integer classes, one per line. - max_shape : Tuple[int] - Maximum shape of tensor shape if tensor is dynamic - chunks : Tuple[int] | True + chunks : Tuple[int] | True, optional Describes how to split tensor dimensions into chunks (files) to store them efficiently. It is anticipated that each file should be ~16MB. Sample Count is also in the list of tensor's dimensions (first dimension) @@ -61,6 +69,7 @@ def __init__( ---------- ValueError: If more than one argument is provided """ + self.check_shape(shape) super().__init__( shape=(), dtype="int64", @@ -158,3 +167,9 @@ def __str__(self): def __repr__(self): return self.__str__() + + def check_shape(self, shape): + if len(shape) not in [0, 1]: + raise ValueError( + "Wrong ClassLabel shape provided, should be of the format () or (None,) or (N,)" + ) diff --git a/hub/schema/deserialize.py b/hub/schema/deserialize.py index 3be18c4f49..c254650ab2 100644 --- a/hub/schema/deserialize.py +++ b/hub/schema/deserialize.py @@ -35,22 +35,28 @@ def deserialize(inp): ) elif inp["type"] == "BBox": return BBox( + shape=tuple(inp["shape"]), dtype=deserialize(inp["dtype"]), chunks=inp["chunks"], compressor=_get_compressor(inp), + max_shape=tuple(inp["max_shape"]), ) elif inp["type"] == "ClassLabel": if inp["_names"] is not None: return ClassLabel( + shape=tuple(inp["shape"]), names=inp["_names"], chunks=inp["chunks"], compressor=_get_compressor(inp), + max_shape=tuple(inp["max_shape"]), ) else: return ClassLabel( + shape=tuple(inp["shape"]), num_classes=inp["_num_classes"], chunks=inp["chunks"], compressor=_get_compressor(inp), + max_shape=tuple(inp["max_shape"]), ) elif inp["type"] == "SchemaDict" or inp["type"] == "FeatureDict": d = {} diff --git a/hub/schema/sequence.py b/hub/schema/sequence.py index 280eb92415..4e06ef7b0f 100644 --- a/hub/schema/sequence.py +++ b/hub/schema/sequence.py @@ -12,12 +12,12 @@ class Sequence(Tensor): At generation time, a list for each of the sequence element is given. The output of `Dataset` will batch all the elements of the sequence together. If the length of the sequence is static and known in advance, it should be - specified in the constructor using the `length` param. + specified in the constructor using the `shape` param. 
| Usage: ---------- - >>> sequence = Sequence(Image(), length=NB_FRAME) + >>> sequence = Sequence(shape=(5,), dtype = Image((100, 100, 3))) """ def __init__( diff --git a/hub/schema/tests/test_features.py b/hub/schema/tests/test_features.py index 5eb3784178..e88d23c1ec 100644 --- a/hub/schema/tests/test_features.py +++ b/hub/schema/tests/test_features.py @@ -15,6 +15,8 @@ from hub.schema.class_label import ClassLabel, _load_names_from_file from hub.schema.features import HubSchema, SchemaDict, Tensor import pytest +from hub import Dataset +import numpy as np def test_hub_feature_flatten(): @@ -94,6 +96,60 @@ def test_class_label(): cl2.names = ["ab", "cd", "ef", "gh"] +def test_class_label_2(): + cl1 = ClassLabel(names=["apple", "banana", "cat"]) + cl2 = ClassLabel((None,), (10,), names=["apple", "banana", "cat"]) + cl3 = ClassLabel((3,), names=["apple", "banana", "cat"]) + my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3} + + ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10), mode="w") + + ds["cl1", 0] = cl1.str2int("cat") + ds["cl1", 1] = cl1.str2int("apple") + ds["cl1", 2] = cl1.str2int("apple") + ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")] + assert ds["cl1", 1].compute(True) == "apple" + assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"] + assert ds["cl1", 3:5].compute(True) == ["banana", "banana"] + + ds["cl2", 0] = np.array( + [cl2.str2int("cat"), cl2.str2int("cat"), cl2.str2int("apple")] + ) + ds["cl2", 1] = np.array([cl2.str2int("apple"), cl2.str2int("banana")]) + ds["cl2", 2] = np.array( + [ + cl2.str2int("cat"), + cl2.str2int("apple"), + cl2.str2int("banana"), + cl2.str2int("apple"), + cl2.str2int("banana"), + ] + ) + ds["cl2", 3] = np.array([cl2.str2int("cat")]) + assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"] + assert ds["cl2", 1].compute(True) == ["apple", "banana"] + assert ds["cl2", 2].compute(True) == ["cat", "apple", "banana", "apple", "banana"] + assert ds["cl2", 3].compute(True) == ["cat"] + + ds["cl3", 0] = np.array( + [cl3.str2int("apple"), cl3.str2int("apple"), cl3.str2int("apple")] + ) + ds["cl3", 1] = np.array( + [cl3.str2int("banana"), cl3.str2int("banana"), cl3.str2int("banana")] + ) + ds["cl3", 2] = np.array( + [cl3.str2int("cat"), cl3.str2int("cat"), cl3.str2int("cat")] + ) + assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"] + assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"] + assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"] + assert ds["cl3", 0:3].compute(True) == [ + ["apple", "apple", "apple"], + ["banana", "banana", "banana"], + ["cat", "cat", "cat"], + ] + + def test_polygon(): with pytest.raises(ValueError): poly1 = Polygon(shape=(11, 3)) @@ -101,6 +157,26 @@ def test_polygon(): poly2 = Polygon(shape=(11, 4, 2)) +def test_bbox_shape(): + with pytest.raises(ValueError): + bb1 = BBox(shape=(11, 3)) + with pytest.raises(ValueError): + bb2 = BBox(shape=(11, 4, 2)) + bb3 = BBox(shape=(None, 4), max_shape=(10, 4)) + bb4 = BBox(shape=(4,)) + bb4 = BBox(shape=(5, 4)) + + +def test_classlabel_shape(): + with pytest.raises(ValueError): + cl1 = ClassLabel(shape=(11, 3)) + with pytest.raises(ValueError): + cl2 = ClassLabel(shape=(11, 4, 2)) + cl3 = ClassLabel(shape=(None,), max_shape=(10,)) + cl4 = ClassLabel() + cl4 = ClassLabel(shape=(5,)) + + test_image_inputs = [ "uint32", "int16", @@ -134,8 +210,8 @@ def test_classlabel_repr(): cl1 = ClassLabel(num_classes=5) cl2 = ClassLabel(names=["apple", "orange", "banana"]) - text1 = "ClassLabel(shape=(), dtype='int64', 
num_classes=5)" - text2 = "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)" + text1 = "ClassLabel(shape=(), dtype='uint16', num_classes=5)" + text2 = "ClassLabel(shape=(), dtype='uint16', names=['apple', 'orange', 'banana'], num_classes=3)" assert cl1.__repr__() == text1 assert cl2.__repr__() == text2 From 97214c8f8969c3a37735439e7e94d49254c070ee Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 15:37:22 +0530 Subject: [PATCH 2/7] minor fixes --- hub/api/integrations.py | 7 +++++-- hub/schema/bbox.py | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hub/api/integrations.py b/hub/api/integrations.py index 06458c703c..40e64443ec 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -143,7 +143,7 @@ def my_transform(sample): return my_transform(dataset) -def _to_tensorflow(dataset, indexes=None, include_shapes=False): +def _to_tensorflow(dataset, indexes=None, include_shapes=False, repeat=False): """| Converts the dataset into a tensorflow compatible format Parameters @@ -186,7 +186,9 @@ def _get_active_item(key, index): def tf_gen(): key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} - for index in indexes: + i = 0 + while i < len(indexes): + index = indexes[i] d = {} for key in dataset.keys: split_key = key.split("/") @@ -208,6 +210,7 @@ def tf_gen(): ] cur[split_key[-1]] = value yield (d) + i = 0 if repeat and i == len(indexes) - 1 else i + 1 def dict_to_tf(my_dtype): d = {} diff --git a/hub/schema/bbox.py b/hub/schema/bbox.py index 435603e0fa..8fa256ec02 100644 --- a/hub/schema/bbox.py +++ b/hub/schema/bbox.py @@ -4,8 +4,7 @@ If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. 
""" -# from typing import Tuple - +from typing import Tuple from hub.schema.features import Tensor From 80e8cc4ce8d4f9026e084546c74931a6719d2b14 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 15:44:46 +0530 Subject: [PATCH 3/7] linting fix --- hub/api/tests/test_dataset.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index 469bb299ad..c120d25a59 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -56,7 +56,10 @@ def test_dataset_append_and_read(): shutil.rmtree("./data/test/test_dataset_append_and_read") ds = Dataset( - schema=dt, shape=(2,), url="./data/test/test_dataset_append_and_read", mode="a", + schema=dt, + shape=(2,), + url="./data/test/test_dataset_append_and_read", + mode="a", ) ds["first"][0] = 2.3 @@ -65,7 +68,10 @@ def test_dataset_append_and_read(): assert ds["second"][0].numpy() != 2.3 ds.close() - ds = Dataset(url="./data/test/test_dataset_append_and_read", mode="r",) + ds = Dataset( + url="./data/test/test_dataset_append_and_read", + mode="r", + ) assert ds.meta_information["description"] == "This is my description" ds.meta_information["hello"] = 5 ds.delete() @@ -154,10 +160,19 @@ def test_dataset_with_chunks(): def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"): schema = { "first": Tensor( - shape=(None, None), dtype="int32", max_shape=(100, 100), chunks=(100,), + shape=(None, None), + dtype="int32", + max_shape=(100, 100), + chunks=(100,), ) } - ds = Dataset(url=url, token=None, shape=(1000,), mode="w", schema=schema,) + ds = Dataset( + url=url, + token=None, + shape=(1000,), + mode="w", + schema=schema, + ) ds["first"][0] = np.ones((10, 10)) @@ -179,7 +194,10 @@ def test_pickleability_gcs(): def test_dataset_dynamic_shaped(): schema = { "first": Tensor( - shape=(None, None), dtype="int32", max_shape=(100, 100), chunks=(100,), + shape=(None, None), + dtype="int32", + max_shape=(100, 100), + chunks=(100,), ) } ds = Dataset( @@ -733,7 +751,9 @@ def my_transform(annotation): assert ds["a", 30].compute() == np.array([0.2]) ds2 = Dataset(schema=my_schema, url="./data/casting3", shape=(100,)) - ds2["a", 0:100] = np.ones(100,) + ds2["a", 0:100] = np.ones( + 100, + ) assert ds2["a", 30].compute() == np.array([1]) From 26e27edfe62c0a5f58cf8b185ce7a78e303d6c52 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 16:27:13 +0530 Subject: [PATCH 4/7] fixed classlabel bug --- hub/schema/class_label.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hub/schema/class_label.py b/hub/schema/class_label.py index ce4aa17ccc..c44a2948b1 100644 --- a/hub/schema/class_label.py +++ b/hub/schema/class_label.py @@ -71,8 +71,9 @@ def __init__( """ self.check_shape(shape) super().__init__( - shape=(), - dtype="int64", + shape=shape, + max_shape=max_shape, + dtype="uint16", chunks=chunks, compressor=compressor, ) From 09f88c87558a02d305d7f1e76348fc10aa27c3fe Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 17:58:31 +0530 Subject: [PATCH 5/7] fixed infinite loop --- hub/api/dataset.py | 4 ++-- hub/api/datasetview.py | 4 ++-- hub/api/integrations.py | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 2c0f5d4feb..1af98d98a9 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -584,7 +584,7 @@ def to_pytorch( ds = _to_pytorch(self, transform, inplace, output_type, indexes) return ds - def 
to_tensorflow(self, indexes=None, include_shapes=False, repeat=False): + def to_tensorflow(self, indexes=None, include_shapes=False): """| Converts the dataset into a tensorflow compatible format Parameters ---------- @@ -596,7 +596,7 @@ def to_tensorflow(self, indexes=None, include_shapes=False, repeat=False): """ from .integrations import _to_tensorflow - ds = _to_tensorflow(self, indexes, include_shapes, repeat=repeat) + ds = _to_tensorflow(self, indexes, include_shapes) return ds def _get_dictionary(self, subpath, slice_=None): diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py index 3bec7990e4..442a8f58ed 100644 --- a/hub/api/datasetview.py +++ b/hub/api/datasetview.py @@ -249,7 +249,7 @@ def __str__(self): def __repr__(self): return self.__str__() - def to_tensorflow(self, include_shapes=False, repeat=False): + def to_tensorflow(self, include_shapes=False): """|Converts the dataset into a tensorflow compatible format Parameters @@ -260,7 +260,7 @@ def to_tensorflow(self, include_shapes=False, repeat=False): """ return self.dataset.to_tensorflow( - indexes=self.indexes, include_shapes=include_shapes, repeat=repeat + indexes=self.indexes, include_shapes=include_shapes ) def to_pytorch( diff --git a/hub/api/integrations.py b/hub/api/integrations.py index 40e64443ec..b24e87fcdc 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -143,7 +143,7 @@ def my_transform(sample): return my_transform(dataset) -def _to_tensorflow(dataset, indexes=None, include_shapes=False, repeat=False): +def _to_tensorflow(dataset, indexes=None, include_shapes=False): """| Converts the dataset into a tensorflow compatible format Parameters @@ -187,8 +187,7 @@ def _get_active_item(key, index): def tf_gen(): key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} i = 0 - while i < len(indexes): - index = indexes[i] + for index in indexes: d = {} for key in dataset.keys: split_key = key.split("/") @@ -210,7 +209,6 @@ def tf_gen(): ] cur[split_key[-1]] = value yield (d) - i = 0 if repeat and i == len(indexes) - 1 else i + 1 def dict_to_tf(my_dtype): d = {} From bbdd6d994f7de2aff523c081a2ec15eaa70bb996 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 18:13:08 +0530 Subject: [PATCH 6/7] changed copy test locations --- hub/api/tests/test_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index c120d25a59..616fa8764f 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -810,7 +810,7 @@ def test_dataset_copy_s3_local(): ) for i in range(100): ds["num", i] = 2 * i - ds2 = ds.copy("s3://snark-test/cp_copy_data_s3_1") + ds2 = ds.copy("s3://snark-test/cp_copy_data_s3_1_a") ds3 = ds2.copy("./data/testing/cp_copy_data_local_1") for i in range(100): assert ds2["num", i].compute() == 2 * i @@ -827,7 +827,7 @@ def test_dataset_copy_gcs_local(): ) for i in range(100): ds["num", i] = 2 * i - ds2 = ds.copy("gcs://snark-test/cp_copy_dataset_gcs_1") + ds2 = ds.copy("gcs://snark-test/cp_copy_dataset_gcs_1a") ds3 = ds2.copy("./data/testing/cp_copy_ds_local_2") for i in range(100): assert ds2["num", i].compute() == 2 * i @@ -884,12 +884,12 @@ def test_dataset_copy_hub_local(): ) def test_dataset_copy_gcs_s3(): ds = Dataset( - "s3://snark-test/cp_original_ds_s3_2", shape=(100,), schema=simple_schema + "s3://snark-test/cp_original_ds_s3_2_a", shape=(100,), schema=simple_schema ) for i in range(100): ds["num", i] = 2 * i - ds2 = 
ds.copy("gcs://snark-test/cp_copy_dataset_gcs_2") - ds3 = ds2.copy("s3://snark-test/cp_copy_ds_s3_3") + ds2 = ds.copy("gcs://snark-test/cp_copy_dataset_gcs_2_a") + ds3 = ds2.copy("s3://snark-test/cp_copy_ds_s3_3_a") for i in range(100): assert ds2["num", i].compute() == 2 * i assert ds3["num", i].compute() == 2 * i From 6d1e6c909cae95d6c8c8997b4386cd1c02adad6e Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 18:31:12 +0530 Subject: [PATCH 7/7] refactored to_tensorflow --- hub/api/integrations.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/hub/api/integrations.py b/hub/api/integrations.py index b24e87fcdc..042a1d3211 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -186,12 +186,10 @@ def _get_active_item(key, index): def tf_gen(): key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} - i = 0 for index in indexes: d = {} for key in dataset.keys: - split_key = key.split("/") - cur = d + split_key, cur = key.split("/"), d for i in range(1, len(split_key) - 1): if split_key[i] in cur.keys(): cur = cur[split_key[i]] @@ -201,13 +199,12 @@ def tf_gen(): cur[split_key[-1]] = _get_active_item(key, index) if isinstance(key_dtype_map[key], Text): value = cur[split_key[-1]] - if value.ndim == 1: - value = "".join(chr(it) for it in value.tolist()) - elif value.ndim == 2: - value = [ - "".join(chr(it) for it in val.tolist()) for val in value - ] - cur[split_key[-1]] = value + cur[split_key[-1]] = ( + "".join(chr(it) for it in value.tolist()) + if value.ndim == 1 + else ["".join(chr(it) for it in val.tolist()) for val in value] + ) + yield (d) def dict_to_tf(my_dtype): @@ -237,12 +234,10 @@ def dtype_to_tf(my_dtype): def get_output_shapes(my_dtype): if isinstance(my_dtype, SchemaDict): return output_shapes_from_dict(my_dtype) - elif isinstance(my_dtype, Text): + elif isinstance(my_dtype, (Text, Primitive)): return () elif isinstance(my_dtype, Tensor): return my_dtype.shape - elif isinstance(my_dtype, Primitive): - return () def output_shapes_from_dict(my_dtype): d = {}