From aa16ba0f9a3696036010c35b06c7a2866623ed3c Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 14:41:40 +0530 Subject: [PATCH 1/7] improvements to to_tensorflow, bbox, classlabels --- benchmarks/benchmark_sequential_write.py | 2 +- hub/api/dataset.py | 5 +- hub/api/datasetview.py | 8 ++- hub/api/integrations.py | 17 +++++ hub/api/tensorview.py | 13 +++- hub/api/tests/test_dataset.py | 34 +++------- hub/compute/ray.py | 2 +- hub/compute/transform.py | 4 +- hub/schema/bbox.py | 30 ++++++++- hub/schema/class_label.py | 23 +++++-- hub/schema/deserialize.py | 6 ++ hub/schema/sequence.py | 4 +- hub/schema/tests/test_features.py | 80 +++++++++++++++++++++++- 13 files changed, 180 insertions(+), 48 deletions(-) diff --git a/benchmarks/benchmark_sequential_write.py b/benchmarks/benchmark_sequential_write.py index f5bd3f0c76..6f86999da6 100644 --- a/benchmarks/benchmark_sequential_write.py +++ b/benchmarks/benchmark_sequential_write.py @@ -33,7 +33,7 @@ def time_batches(dataset, batch_size=1, num_batches=1, hub=False): dataset["label"][ batch * batch_size : (batch + 1) * batch_size ] = np.random.randint(10, size=(batch_size, 1)) - dataset.commit() + dataset.flush() counter += 1 t1 = time() print("Batch", counter, f"dt: {t1 - t0}") diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 5b8b719560..2c0f5d4feb 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -584,9 +584,8 @@ def to_pytorch( ds = _to_pytorch(self, transform, inplace, output_type, indexes) return ds - def to_tensorflow(self, indexes=None, include_shapes=False): + def to_tensorflow(self, indexes=None, include_shapes=False, repeat=False): """| Converts the dataset into a tensorflow compatible format - Parameters ---------- indexes: list or int, optional @@ -597,7 +596,7 @@ def to_tensorflow(self, indexes=None, include_shapes=False): """ from .integrations import _to_tensorflow - ds = _to_tensorflow(self, indexes, include_shapes) + ds = _to_tensorflow(self, indexes, include_shapes, repeat=repeat) return ds def _get_dictionary(self, subpath, slice_=None): diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py index 17418d5043..3bec7990e4 100644 --- a/hub/api/datasetview.py +++ b/hub/api/datasetview.py @@ -249,7 +249,7 @@ def __str__(self): def __repr__(self): return self.__str__() - def to_tensorflow(self, include_shapes): + def to_tensorflow(self, include_shapes=False, repeat=False): """|Converts the dataset into a tensorflow compatible format Parameters @@ -260,7 +260,7 @@ def to_tensorflow(self, include_shapes): """ return self.dataset.to_tensorflow( - indexes=self.indexes, include_shapes=include_shapes + indexes=self.indexes, include_shapes=include_shapes, repeat=repeat ) def to_pytorch( @@ -295,6 +295,10 @@ def commit(self) -> None: """Commit dataset""" self.dataset.commit() + def flush(self) -> None: + """Flush dataset""" + self.dataset.flush() + def numpy(self, label_name=False): """Gets the value from different tensorview objects in the datasetview schema diff --git a/hub/api/integrations.py b/hub/api/integrations.py index 38c00a69da..06458c703c 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -185,6 +185,7 @@ def _get_active_item(key, index): return _active_chunks[key][index % samples_per_chunk] def tf_gen(): + key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} for index in indexes: d = {} for key in dataset.keys: @@ -197,6 +198,15 @@ def tf_gen(): cur[split_key[i]] = {} cur = cur[split_key[i]] cur[split_key[-1]] = _get_active_item(key, index) + if 
isinstance(key_dtype_map[key], Text): + value = cur[split_key[-1]] + if value.ndim == 1: + value = "".join(chr(it) for it in value.tolist()) + elif value.ndim == 2: + value = [ + "".join(chr(it) for it in val.tolist()) for val in value + ] + cur[split_key[-1]] = value yield (d) def dict_to_tf(my_dtype): @@ -208,9 +218,14 @@ def dict_to_tf(my_dtype): def tensor_to_tf(my_dtype): return dtype_to_tf(my_dtype.dtype) + def text_to_tf(my_dtype): + return "string" + def dtype_to_tf(my_dtype): if isinstance(my_dtype, SchemaDict): return dict_to_tf(my_dtype) + elif isinstance(my_dtype, Text): + return text_to_tf(my_dtype) elif isinstance(my_dtype, Tensor): return tensor_to_tf(my_dtype) elif isinstance(my_dtype, Primitive): @@ -221,6 +236,8 @@ def dtype_to_tf(my_dtype): def get_output_shapes(my_dtype): if isinstance(my_dtype, SchemaDict): return output_shapes_from_dict(my_dtype) + elif isinstance(my_dtype, Text): + return () elif isinstance(my_dtype, Tensor): return my_dtype.shape elif isinstance(my_dtype, Primitive): diff --git a/hub/api/tensorview.py b/hub/api/tensorview.py index 5769924ccb..0f33a0bda0 100644 --- a/hub/api/tensorview.py +++ b/hub/api/tensorview.py @@ -98,9 +98,18 @@ def numpy(self, label_name=False): if isinstance(self.dtype, hub.schema.class_label.ClassLabel) and label_name: if isinstance(self.indexes, int): - value = self.dtype.int2str(value) + if value.ndim == 0: + value = self.dtype.int2str(value) + elif value.ndim == 1: + value = [self.dtype.int2str(value[i]) for i in range(value.size)] else: - value = [self.dtype.int2str(value[i]) for i in range(value.size)] + if value.ndim == 1: + value = [self.dtype.int2str(value[i]) for i in range(value.size)] + elif value.ndim == 2: + value = [ + [self.dtype.int2str(item[i]) for i in range(item.size)] + for item in value + ] if isinstance(self.dtype, hub.schema.text.Text): if self.dataset.tokenizer is not None: diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index 6cce2bd951..469bb299ad 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -56,10 +56,7 @@ def test_dataset_append_and_read(): shutil.rmtree("./data/test/test_dataset_append_and_read") ds = Dataset( - schema=dt, - shape=(2,), - url="./data/test/test_dataset_append_and_read", - mode="a", + schema=dt, shape=(2,), url="./data/test/test_dataset_append_and_read", mode="a", ) ds["first"][0] = 2.3 @@ -68,10 +65,7 @@ def test_dataset_append_and_read(): assert ds["second"][0].numpy() != 2.3 ds.close() - ds = Dataset( - url="./data/test/test_dataset_append_and_read", - mode="r", - ) + ds = Dataset(url="./data/test/test_dataset_append_and_read", mode="r",) assert ds.meta_information["description"] == "This is my description" ds.meta_information["hello"] = 5 ds.delete() @@ -160,19 +154,10 @@ def test_dataset_with_chunks(): def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"): schema = { "first": Tensor( - shape=(None, None), - dtype="int32", - max_shape=(100, 100), - chunks=(100,), + shape=(None, None), dtype="int32", max_shape=(100, 100), chunks=(100,), ) } - ds = Dataset( - url=url, - token=None, - shape=(1000,), - mode="w", - schema=schema, - ) + ds = Dataset(url=url, token=None, shape=(1000,), mode="w", schema=schema,) ds["first"][0] = np.ones((10, 10)) @@ -194,10 +179,7 @@ def test_pickleability_gcs(): def test_dataset_dynamic_shaped(): schema = { "first": Tensor( - shape=(None, None), - dtype="int32", - max_shape=(100, 100), - chunks=(100,), + shape=(None, None), dtype="int32", max_shape=(100, 100), 
chunks=(100,),
         )
     }
     ds = Dataset(
@@ -554,7 +536,7 @@ def test_append_dataset():
     assert ds["first"].shape[0] == 120
     assert ds["first", 5:10].shape[0] == 5
     assert ds["second"].shape[0] == 120
-    ds.commit()
+    ds.flush()
 
     ds = Dataset(url)
     assert ds["first"].shape[0] == 120
@@ -751,9 +733,7 @@ def my_transform(annotation):
     assert ds["a", 30].compute() == np.array([0.2])
 
     ds2 = Dataset(schema=my_schema, url="./data/casting3", shape=(100,))
-    ds2["a", 0:100] = np.ones(
-        100,
-    )
+    ds2["a", 0:100] = np.ones(100,)
     assert ds2["a", 30].compute() == np.array([1])
 
 
diff --git a/hub/compute/ray.py b/hub/compute/ray.py
index f602551079..817da70a3d 100644
--- a/hub/compute/ray.py
+++ b/hub/compute/ray.py
@@ -225,7 +225,7 @@ def upload(
         results = ray.get(tasks)
         self.set_dynamic_shapes(results, ds)
-        ds.commit()
+        ds.flush()
         return ds
 
     def set_dynamic_shapes(self, results, ds):
diff --git a/hub/compute/transform.py b/hub/compute/transform.py
index 86d1530f92..5f38a392b9 100644
--- a/hub/compute/transform.py
+++ b/hub/compute/transform.py
@@ -288,7 +288,7 @@ def upload_chunk(i_batch):
                     [slice(offset, offset + len(value))], value
                 )
 
-        ds.commit()
+        ds.flush()
         return ds
 
     def call_func(self, fn_index, item, as_list=False):
@@ -439,7 +439,7 @@ def batchify_generator(iterator: Iterable, size: int):
             start += n_results
 
         ds_out.resize_shape(total)
-        ds_out.commit()
+        ds_out.flush()
         return ds_out
 
     @property
diff --git a/hub/schema/bbox.py b/hub/schema/bbox.py
index 2285babb1b..435603e0fa 100644
--- a/hub/schema/bbox.py
+++ b/hub/schema/bbox.py
@@ -16,11 +16,26 @@ class BBox(Tensor):
     normalized coordinates of the bounding box `[xmin, ymin, xmax, ymax]`
     """
 
-    def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
+    def __init__(
+        self,
+        shape: Tuple[int, ...] = (4,),
+        max_shape: Tuple[int, ...] = None,
+        dtype="float64",
+        chunks=None,
+        compressor="lz4",
+    ):
         """Construct the connector.
 
         Parameters
         ----------
+        shape: tuple of ints or None
+            The shape of the bounding boxes for a sample.
+            Will be (4,) if there is a single bounding box per sample.
+            If there are N bounding boxes per sample, the shape should be (N, 4).
+            If the number of bounding boxes per sample varies from 0 to M, the shape should be set to (None, 4) and max_shape should be set to (M, 4).
+            Defaults to (4,).
+        max_shape : Tuple[int], optional
+            Maximum shape of BBox if shape is dynamic
         dtype : str
             dtype of bbox coordinates. Default: 'float64'
         chunks : Tuple[int] | True
@@ -29,8 +44,13 @@ def __init__(self, dtype="float64", chunks=None, compressor="lz4"):
             Sample Count is also in the list of tensor's dimensions (first dimension)
             If default value is chosen, automatically detects how to split into chunks
         """
+        self.check_shape(shape)
         super(BBox, self).__init__(
-            shape=(4,), dtype=dtype, chunks=chunks, compressor=compressor
+            shape=shape,
+            max_shape=max_shape,
+            dtype=dtype,
+            chunks=chunks,
+            compressor=compressor,
         )
 
     def __str__(self):
@@ -40,3 +60,9 @@ def __str__(self):
 
     def __repr__(self):
         return self.__str__()
+
+    def check_shape(self, shape):
+        if len(shape) not in [1, 2] or shape[-1] != 4:
+            raise ValueError(
+                "Wrong BBox shape provided, should be of the format (4,) or (None, 4) or (N, 4)"
+            )
diff --git a/hub/schema/class_label.py b/hub/schema/class_label.py
index b03cabe921..ce4aa17ccc 100644
--- a/hub/schema/class_label.py
+++ b/hub/schema/class_label.py
@@ -4,7 +4,7 @@
 If a copy of the MPL was not distributed with this file, You
 can obtain one at https://mozilla.org/MPL/2.0/.
""" -from typing import List +from typing import List, Tuple from hub.schema.features import Tensor @@ -18,6 +18,8 @@ class ClassLabel(Tensor): def __init__( self, + shape: Tuple[int, ...] = (), + max_shape: Tuple[int, ...] = None, num_classes: int = None, names: List[str] = None, names_file: str = None, @@ -40,15 +42,21 @@ def __init__( Parameters ---------- + shape: tuple of ints or None + The shape of classlabel. + Will be () if only one classbabel corresponding to each sample. + If N classlabels corresponding to each sample, shape should be (N,) + If the number of classlabels for each sample vary from 0 to M. The shape should be set to (None,) and max_shape should be set to (M,) + Defaults to (). + max_shape : Tuple[int], optional + Maximum shape of ClassLabel num_classes: `int` number of classes. All labels must be < num_classes. names: `list` string names for the integer classes. The order in which the names are provided is kept. names_file: `str` path to a file with names for the integer classes, one per line. - max_shape : Tuple[int] - Maximum shape of tensor shape if tensor is dynamic - chunks : Tuple[int] | True + chunks : Tuple[int] | True, optional Describes how to split tensor dimensions into chunks (files) to store them efficiently. It is anticipated that each file should be ~16MB. Sample Count is also in the list of tensor's dimensions (first dimension) @@ -61,6 +69,7 @@ def __init__( ---------- ValueError: If more than one argument is provided """ + self.check_shape(shape) super().__init__( shape=(), dtype="int64", @@ -158,3 +167,9 @@ def __str__(self): def __repr__(self): return self.__str__() + + def check_shape(self, shape): + if len(shape) not in [0, 1]: + raise ValueError( + "Wrong ClassLabel shape provided, should be of the format () or (None,) or (N,)" + ) diff --git a/hub/schema/deserialize.py b/hub/schema/deserialize.py index 3be18c4f49..c254650ab2 100644 --- a/hub/schema/deserialize.py +++ b/hub/schema/deserialize.py @@ -35,22 +35,28 @@ def deserialize(inp): ) elif inp["type"] == "BBox": return BBox( + shape=tuple(inp["shape"]), dtype=deserialize(inp["dtype"]), chunks=inp["chunks"], compressor=_get_compressor(inp), + max_shape=tuple(inp["max_shape"]), ) elif inp["type"] == "ClassLabel": if inp["_names"] is not None: return ClassLabel( + shape=tuple(inp["shape"]), names=inp["_names"], chunks=inp["chunks"], compressor=_get_compressor(inp), + max_shape=tuple(inp["max_shape"]), ) else: return ClassLabel( + shape=tuple(inp["shape"]), num_classes=inp["_num_classes"], chunks=inp["chunks"], compressor=_get_compressor(inp), + max_shape=tuple(inp["max_shape"]), ) elif inp["type"] == "SchemaDict" or inp["type"] == "FeatureDict": d = {} diff --git a/hub/schema/sequence.py b/hub/schema/sequence.py index 280eb92415..4e06ef7b0f 100644 --- a/hub/schema/sequence.py +++ b/hub/schema/sequence.py @@ -12,12 +12,12 @@ class Sequence(Tensor): At generation time, a list for each of the sequence element is given. The output of `Dataset` will batch all the elements of the sequence together. If the length of the sequence is static and known in advance, it should be - specified in the constructor using the `length` param. + specified in the constructor using the `shape` param. 
| Usage: ---------- - >>> sequence = Sequence(Image(), length=NB_FRAME) + >>> sequence = Sequence(shape=(5,), dtype = Image((100, 100, 3))) """ def __init__( diff --git a/hub/schema/tests/test_features.py b/hub/schema/tests/test_features.py index 5eb3784178..e88d23c1ec 100644 --- a/hub/schema/tests/test_features.py +++ b/hub/schema/tests/test_features.py @@ -15,6 +15,8 @@ from hub.schema.class_label import ClassLabel, _load_names_from_file from hub.schema.features import HubSchema, SchemaDict, Tensor import pytest +from hub import Dataset +import numpy as np def test_hub_feature_flatten(): @@ -94,6 +96,60 @@ def test_class_label(): cl2.names = ["ab", "cd", "ef", "gh"] +def test_class_label_2(): + cl1 = ClassLabel(names=["apple", "banana", "cat"]) + cl2 = ClassLabel((None,), (10,), names=["apple", "banana", "cat"]) + cl3 = ClassLabel((3,), names=["apple", "banana", "cat"]) + my_schema = {"cl1": cl1, "cl2": cl2, "cl3": cl3} + + ds = Dataset("./data/cl_2d_3d", schema=my_schema, shape=(10), mode="w") + + ds["cl1", 0] = cl1.str2int("cat") + ds["cl1", 1] = cl1.str2int("apple") + ds["cl1", 2] = cl1.str2int("apple") + ds["cl1", 3:5] = [cl1.str2int("banana"), cl1.str2int("banana")] + assert ds["cl1", 1].compute(True) == "apple" + assert ds["cl1", 0:3].compute(True) == ["cat", "apple", "apple"] + assert ds["cl1", 3:5].compute(True) == ["banana", "banana"] + + ds["cl2", 0] = np.array( + [cl2.str2int("cat"), cl2.str2int("cat"), cl2.str2int("apple")] + ) + ds["cl2", 1] = np.array([cl2.str2int("apple"), cl2.str2int("banana")]) + ds["cl2", 2] = np.array( + [ + cl2.str2int("cat"), + cl2.str2int("apple"), + cl2.str2int("banana"), + cl2.str2int("apple"), + cl2.str2int("banana"), + ] + ) + ds["cl2", 3] = np.array([cl2.str2int("cat")]) + assert ds["cl2", 0].compute(True) == ["cat", "cat", "apple"] + assert ds["cl2", 1].compute(True) == ["apple", "banana"] + assert ds["cl2", 2].compute(True) == ["cat", "apple", "banana", "apple", "banana"] + assert ds["cl2", 3].compute(True) == ["cat"] + + ds["cl3", 0] = np.array( + [cl3.str2int("apple"), cl3.str2int("apple"), cl3.str2int("apple")] + ) + ds["cl3", 1] = np.array( + [cl3.str2int("banana"), cl3.str2int("banana"), cl3.str2int("banana")] + ) + ds["cl3", 2] = np.array( + [cl3.str2int("cat"), cl3.str2int("cat"), cl3.str2int("cat")] + ) + assert ds["cl3", 0].compute(True) == ["apple", "apple", "apple"] + assert ds["cl3", 1].compute(True) == ["banana", "banana", "banana"] + assert ds["cl3", 2].compute(True) == ["cat", "cat", "cat"] + assert ds["cl3", 0:3].compute(True) == [ + ["apple", "apple", "apple"], + ["banana", "banana", "banana"], + ["cat", "cat", "cat"], + ] + + def test_polygon(): with pytest.raises(ValueError): poly1 = Polygon(shape=(11, 3)) @@ -101,6 +157,26 @@ def test_polygon(): poly2 = Polygon(shape=(11, 4, 2)) +def test_bbox_shape(): + with pytest.raises(ValueError): + bb1 = BBox(shape=(11, 3)) + with pytest.raises(ValueError): + bb2 = BBox(shape=(11, 4, 2)) + bb3 = BBox(shape=(None, 4), max_shape=(10, 4)) + bb4 = BBox(shape=(4,)) + bb4 = BBox(shape=(5, 4)) + + +def test_classlabel_shape(): + with pytest.raises(ValueError): + cl1 = ClassLabel(shape=(11, 3)) + with pytest.raises(ValueError): + cl2 = ClassLabel(shape=(11, 4, 2)) + cl3 = ClassLabel(shape=(None,), max_shape=(10,)) + cl4 = ClassLabel() + cl4 = ClassLabel(shape=(5,)) + + test_image_inputs = [ "uint32", "int16", @@ -134,8 +210,8 @@ def test_classlabel_repr(): cl1 = ClassLabel(num_classes=5) cl2 = ClassLabel(names=["apple", "orange", "banana"]) - text1 = "ClassLabel(shape=(), dtype='int64', 
num_classes=5)" - text2 = "ClassLabel(shape=(), dtype='int64', names=['apple', 'orange', 'banana'], num_classes=3)" + text1 = "ClassLabel(shape=(), dtype='uint16', num_classes=5)" + text2 = "ClassLabel(shape=(), dtype='uint16', names=['apple', 'orange', 'banana'], num_classes=3)" assert cl1.__repr__() == text1 assert cl2.__repr__() == text2 From 97214c8f8969c3a37735439e7e94d49254c070ee Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 15:37:22 +0530 Subject: [PATCH 2/7] minor fixes --- hub/api/integrations.py | 7 +++++-- hub/schema/bbox.py | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/hub/api/integrations.py b/hub/api/integrations.py index 06458c703c..40e64443ec 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -143,7 +143,7 @@ def my_transform(sample): return my_transform(dataset) -def _to_tensorflow(dataset, indexes=None, include_shapes=False): +def _to_tensorflow(dataset, indexes=None, include_shapes=False, repeat=False): """| Converts the dataset into a tensorflow compatible format Parameters @@ -186,7 +186,9 @@ def _get_active_item(key, index): def tf_gen(): key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} - for index in indexes: + i = 0 + while i < len(indexes): + index = indexes[i] d = {} for key in dataset.keys: split_key = key.split("/") @@ -208,6 +210,7 @@ def tf_gen(): ] cur[split_key[-1]] = value yield (d) + i = 0 if repeat and i == len(indexes) - 1 else i + 1 def dict_to_tf(my_dtype): d = {} diff --git a/hub/schema/bbox.py b/hub/schema/bbox.py index 435603e0fa..8fa256ec02 100644 --- a/hub/schema/bbox.py +++ b/hub/schema/bbox.py @@ -4,8 +4,7 @@ If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. 
""" -# from typing import Tuple - +from typing import Tuple from hub.schema.features import Tensor From 80e8cc4ce8d4f9026e084546c74931a6719d2b14 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 15:44:46 +0530 Subject: [PATCH 3/7] linting fix --- hub/api/tests/test_dataset.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index 469bb299ad..c120d25a59 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -56,7 +56,10 @@ def test_dataset_append_and_read(): shutil.rmtree("./data/test/test_dataset_append_and_read") ds = Dataset( - schema=dt, shape=(2,), url="./data/test/test_dataset_append_and_read", mode="a", + schema=dt, + shape=(2,), + url="./data/test/test_dataset_append_and_read", + mode="a", ) ds["first"][0] = 2.3 @@ -65,7 +68,10 @@ def test_dataset_append_and_read(): assert ds["second"][0].numpy() != 2.3 ds.close() - ds = Dataset(url="./data/test/test_dataset_append_and_read", mode="r",) + ds = Dataset( + url="./data/test/test_dataset_append_and_read", + mode="r", + ) assert ds.meta_information["description"] == "This is my description" ds.meta_information["hello"] = 5 ds.delete() @@ -154,10 +160,19 @@ def test_dataset_with_chunks(): def test_pickleability(url="./data/test/test_dataset_dynamic_shaped"): schema = { "first": Tensor( - shape=(None, None), dtype="int32", max_shape=(100, 100), chunks=(100,), + shape=(None, None), + dtype="int32", + max_shape=(100, 100), + chunks=(100,), ) } - ds = Dataset(url=url, token=None, shape=(1000,), mode="w", schema=schema,) + ds = Dataset( + url=url, + token=None, + shape=(1000,), + mode="w", + schema=schema, + ) ds["first"][0] = np.ones((10, 10)) @@ -179,7 +194,10 @@ def test_pickleability_gcs(): def test_dataset_dynamic_shaped(): schema = { "first": Tensor( - shape=(None, None), dtype="int32", max_shape=(100, 100), chunks=(100,), + shape=(None, None), + dtype="int32", + max_shape=(100, 100), + chunks=(100,), ) } ds = Dataset( @@ -733,7 +751,9 @@ def my_transform(annotation): assert ds["a", 30].compute() == np.array([0.2]) ds2 = Dataset(schema=my_schema, url="./data/casting3", shape=(100,)) - ds2["a", 0:100] = np.ones(100,) + ds2["a", 0:100] = np.ones( + 100, + ) assert ds2["a", 30].compute() == np.array([1]) From 26e27edfe62c0a5f58cf8b185ce7a78e303d6c52 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 16:27:13 +0530 Subject: [PATCH 4/7] fixed classlabel bug --- hub/schema/class_label.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hub/schema/class_label.py b/hub/schema/class_label.py index ce4aa17ccc..c44a2948b1 100644 --- a/hub/schema/class_label.py +++ b/hub/schema/class_label.py @@ -71,8 +71,9 @@ def __init__( """ self.check_shape(shape) super().__init__( - shape=(), - dtype="int64", + shape=shape, + max_shape=max_shape, + dtype="uint16", chunks=chunks, compressor=compressor, ) From 09f88c87558a02d305d7f1e76348fc10aa27c3fe Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 17:58:31 +0530 Subject: [PATCH 5/7] fixed infinite loop --- hub/api/dataset.py | 4 ++-- hub/api/datasetview.py | 4 ++-- hub/api/integrations.py | 6 ++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 2c0f5d4feb..1af98d98a9 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -584,7 +584,7 @@ def to_pytorch( ds = _to_pytorch(self, transform, inplace, output_type, indexes) return ds - def 
to_tensorflow(self, indexes=None, include_shapes=False, repeat=False): + def to_tensorflow(self, indexes=None, include_shapes=False): """| Converts the dataset into a tensorflow compatible format Parameters ---------- @@ -596,7 +596,7 @@ def to_tensorflow(self, indexes=None, include_shapes=False, repeat=False): """ from .integrations import _to_tensorflow - ds = _to_tensorflow(self, indexes, include_shapes, repeat=repeat) + ds = _to_tensorflow(self, indexes, include_shapes) return ds def _get_dictionary(self, subpath, slice_=None): diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py index 3bec7990e4..442a8f58ed 100644 --- a/hub/api/datasetview.py +++ b/hub/api/datasetview.py @@ -249,7 +249,7 @@ def __str__(self): def __repr__(self): return self.__str__() - def to_tensorflow(self, include_shapes=False, repeat=False): + def to_tensorflow(self, include_shapes=False): """|Converts the dataset into a tensorflow compatible format Parameters @@ -260,7 +260,7 @@ def to_tensorflow(self, include_shapes=False, repeat=False): """ return self.dataset.to_tensorflow( - indexes=self.indexes, include_shapes=include_shapes, repeat=repeat + indexes=self.indexes, include_shapes=include_shapes ) def to_pytorch( diff --git a/hub/api/integrations.py b/hub/api/integrations.py index 40e64443ec..b24e87fcdc 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -143,7 +143,7 @@ def my_transform(sample): return my_transform(dataset) -def _to_tensorflow(dataset, indexes=None, include_shapes=False, repeat=False): +def _to_tensorflow(dataset, indexes=None, include_shapes=False): """| Converts the dataset into a tensorflow compatible format Parameters @@ -187,8 +187,7 @@ def _get_active_item(key, index): def tf_gen(): key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} i = 0 - while i < len(indexes): - index = indexes[i] + for index in indexes: d = {} for key in dataset.keys: split_key = key.split("/") @@ -210,7 +209,6 @@ def tf_gen(): ] cur[split_key[-1]] = value yield (d) - i = 0 if repeat and i == len(indexes) - 1 else i + 1 def dict_to_tf(my_dtype): d = {} From bbdd6d994f7de2aff523c081a2ec15eaa70bb996 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 18:13:08 +0530 Subject: [PATCH 6/7] changed copy test locations --- hub/api/tests/test_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hub/api/tests/test_dataset.py b/hub/api/tests/test_dataset.py index c120d25a59..616fa8764f 100644 --- a/hub/api/tests/test_dataset.py +++ b/hub/api/tests/test_dataset.py @@ -810,7 +810,7 @@ def test_dataset_copy_s3_local(): ) for i in range(100): ds["num", i] = 2 * i - ds2 = ds.copy("s3://snark-test/cp_copy_data_s3_1") + ds2 = ds.copy("s3://snark-test/cp_copy_data_s3_1_a") ds3 = ds2.copy("./data/testing/cp_copy_data_local_1") for i in range(100): assert ds2["num", i].compute() == 2 * i @@ -827,7 +827,7 @@ def test_dataset_copy_gcs_local(): ) for i in range(100): ds["num", i] = 2 * i - ds2 = ds.copy("gcs://snark-test/cp_copy_dataset_gcs_1") + ds2 = ds.copy("gcs://snark-test/cp_copy_dataset_gcs_1a") ds3 = ds2.copy("./data/testing/cp_copy_ds_local_2") for i in range(100): assert ds2["num", i].compute() == 2 * i @@ -884,12 +884,12 @@ def test_dataset_copy_hub_local(): ) def test_dataset_copy_gcs_s3(): ds = Dataset( - "s3://snark-test/cp_original_ds_s3_2", shape=(100,), schema=simple_schema + "s3://snark-test/cp_original_ds_s3_2_a", shape=(100,), schema=simple_schema ) for i in range(100): ds["num", i] = 2 * i - ds2 = 
ds.copy("gcs://snark-test/cp_copy_dataset_gcs_2") - ds3 = ds2.copy("s3://snark-test/cp_copy_ds_s3_3") + ds2 = ds.copy("gcs://snark-test/cp_copy_dataset_gcs_2_a") + ds3 = ds2.copy("s3://snark-test/cp_copy_ds_s3_3_a") for i in range(100): assert ds2["num", i].compute() == 2 * i assert ds3["num", i].compute() == 2 * i From 6d1e6c909cae95d6c8c8997b4386cd1c02adad6e Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Mon, 8 Mar 2021 18:31:12 +0530 Subject: [PATCH 7/7] refactored to_tensorflow --- hub/api/integrations.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/hub/api/integrations.py b/hub/api/integrations.py index b24e87fcdc..042a1d3211 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -186,12 +186,10 @@ def _get_active_item(key, index): def tf_gen(): key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys} - i = 0 for index in indexes: d = {} for key in dataset.keys: - split_key = key.split("/") - cur = d + split_key, cur = key.split("/"), d for i in range(1, len(split_key) - 1): if split_key[i] in cur.keys(): cur = cur[split_key[i]] @@ -201,13 +199,12 @@ def tf_gen(): cur[split_key[-1]] = _get_active_item(key, index) if isinstance(key_dtype_map[key], Text): value = cur[split_key[-1]] - if value.ndim == 1: - value = "".join(chr(it) for it in value.tolist()) - elif value.ndim == 2: - value = [ - "".join(chr(it) for it in val.tolist()) for val in value - ] - cur[split_key[-1]] = value + cur[split_key[-1]] = ( + "".join(chr(it) for it in value.tolist()) + if value.ndim == 1 + else ["".join(chr(it) for it in val.tolist()) for val in value] + ) + yield (d) def dict_to_tf(my_dtype): @@ -237,12 +234,10 @@ def dtype_to_tf(my_dtype): def get_output_shapes(my_dtype): if isinstance(my_dtype, SchemaDict): return output_shapes_from_dict(my_dtype) - elif isinstance(my_dtype, Text): + elif isinstance(my_dtype, (Text, Primitive)): return () elif isinstance(my_dtype, Tensor): return my_dtype.shape - elif isinstance(my_dtype, Primitive): - return () def output_shapes_from_dict(my_dtype): d = {}