Commit 7e5be1b

Replace shuffle parameter with shuffle buffer size (#126)
* Drop shuffle parameter from tensor_generator
* Replace shuffle parameter with shuffle_buffer_size (unused)
* Implement shuffle_buffer_size for TensorflowTileDBDataset
* Change PyTorchTileDBDataLoader from class to function
* Implement shuffle_buffer_size for PyTorchTileDBDataset
1 parent 891cc35 commit 7e5be1b
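
For callers the change is mechanical: the boolean shuffle flag becomes an explicit shuffle buffer size (0 disables shuffling). A minimal usage sketch based on the notebook updates in this commit; the import path and array URIs are assumptions for illustration, not part of the diff:

import tiledb
# Assumed module path inferred from the repo layout (tiledb/ml/readers/);
# the actual import may differ.
from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset

# "training_images" / "training_labels" are placeholder TileDB array URIs.
with tiledb.open("training_images") as x, tiledb.open("training_labels") as y:
    tiledb_dataset = TensorflowTileDBDataset(
        x_array=x,
        y_array=y,
        x_attrs=["features"],
        y_attrs=["features"],
        batch_size=64,
        buffer_bytes=1024**2,
        shuffle_buffer_size=128,  # was shuffle=True before this commit
    )
    # tiledb_dataset is a tf.data.Dataset and can be passed to model.fit().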

File tree

10 files changed: +119, -121 lines changed


examples/cloud/models/pytorch_tiledb_cloud_ml_model_array.ipynb

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@
 "\n",
 "epochs = 1\n",
 "batch_size_train = 128\n",
-"batch_size_test = 1000\n",
 "learning_rate = 0.01\n",
 "momentum = 0.5\n",
 "log_interval = 10\n",
@@ -119,7 +118,7 @@
 " torchvision.transforms.Normalize(\n",
 " (0.1307,), (0.3081,))\n",
 " ])),\n",
-" batch_size=batch_size_train, shuffle=True)"
+" batch_size=batch_size_train, shuffle_buffer_size=2*batch_size_train)"
 ]
 },
 {

examples/models/pytorch_tiledb_models_example.ipynb

Lines changed: 1 addition & 2 deletions
@@ -67,7 +67,6 @@
 "source": [
 "epochs = 1\n",
 "batch_size_train = 128\n",
-"batch_size_test = 1000\n",
 "learning_rate = 0.01\n",
 "momentum = 0.5\n",
 "log_interval = 10\n",
@@ -108,7 +107,7 @@
 " torchvision.transforms.Normalize(\n",
 " (0.1307,), (0.3081,))\n",
 " ])),\n",
-" batch_size=batch_size_train, shuffle=True)\n"
+" batch_size=batch_size_train, shuffle_buffer_size=2*batch_size_train)\n"
 ]
 },
 {

examples/readers/pytorch_data_api_tiledb_dense.ipynb

Lines changed: 1 addition & 1 deletion
@@ -372,7 +372,7 @@
 " train_loader = PyTorchTileDBDataLoader(x_array=x, y_array=y,\n",
 " batch_size=64,\n",
 " buffer_bytes=1024**2,\n",
-" shuffle=True)\n",
+" shuffle_buffer_size=128)\n",
 " net = Net(shape=(28, 28))\n",
 " criterion = nn.CrossEntropyLoss()\n",
 " optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.5)\n",

examples/readers/tensorflow_data_api_tiledb_dense.ipynb

Lines changed: 1 addition & 1 deletion
@@ -340,7 +340,7 @@
 "with tiledb.open(training_images) as x, tiledb.open(training_labels) as y:\n",
 " tiledb_dataset = TensorflowTileDBDataset(\n",
 " x_array=x, y_array=y, x_attrs=['features'], y_attrs=['features'], \n",
-" batch_size=64, buffer_bytes=1024**2, shuffle=True\n",
+" batch_size=64, buffer_bytes=1024**2, shuffle_buffer_size=128\n",
 " )\n",
 " model.fit(tiledb_dataset, epochs=5)"
 ]

tests/readers/test_pytorch.py

Lines changed: 13 additions & 15 deletions
@@ -27,9 +27,9 @@ def test_dataset(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         with ingest_in_tiledb(
             tmpdir,
@@ -40,9 +40,7 @@ def test_dataset(
             num_attrs=num_attrs,
             pass_attrs=pass_attrs,
         ) as kwargs:
-            dataset = PyTorchTileDBDataset(
-                buffer_bytes=buffer_bytes, shuffle=shuffle, **kwargs
-            )
+            dataset = PyTorchTileDBDataset(buffer_bytes=buffer_bytes, **kwargs)
             assert isinstance(dataset, torch.utils.data.IterableDataset)
             validate_tensor_generator(
                 dataset, num_attrs, x_sparse, y_sparse, x_shape, y_shape
@@ -61,9 +59,9 @@ def test_dataloader(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         if num_workers and (x_sparse or y_sparse):
             pytest.skip("multiple workers not supported with sparse arrays")
@@ -81,7 +79,7 @@ def test_dataloader(
                 num_workers=num_workers,
                 buffer_bytes=buffer_bytes,
                 batch_size=batch_size,
-                shuffle=shuffle,
+                shuffle_buffer_size=shuffle_buffer_size,
                 **kwargs
             )
             assert isinstance(dataloader, torch.utils.data.DataLoader)
@@ -123,9 +121,9 @@ def test_unequal_num_rows(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         with ingest_in_tiledb(
             tmpdir,
@@ -142,12 +140,12 @@ def test_unequal_num_rows(
                     num_workers=num_workers,
                     buffer_bytes=buffer_bytes,
                     batch_size=batch_size,
-                    shuffle=shuffle,
+                    shuffle_buffer_size=shuffle_buffer_size,
                     **kwargs
                 )
             assert "X and Y arrays must have the same number of rows" in str(ex.value)
 
-    @parametrize_for_dataset(x_sparse=[True], shuffle=[False])
+    @parametrize_for_dataset(x_sparse=[True], shuffle_buffer_size=[0])
     def test_sparse_read_order(
         self,
         tmpdir,
@@ -158,9 +156,9 @@ def test_sparse_read_order(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         x_data = rand_array(num_rows, *x_shape, sparse=x_sparse)
         with ingest_in_tiledb(
@@ -175,7 +173,7 @@ def test_sparse_read_order(
             dataloader = PyTorchTileDBDataLoader(
                 buffer_bytes=buffer_bytes,
                 batch_size=batch_size,
-                shuffle=shuffle,
+                shuffle_buffer_size=shuffle_buffer_size,
                 **kwargs
             )
             generated_x_data = np.concatenate(

tests/readers/test_tensorflow.py

Lines changed: 10 additions & 11 deletions
@@ -38,9 +38,9 @@ def test_dataset(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         with ingest_in_tiledb(
             tmpdir,
@@ -54,7 +54,7 @@ def test_dataset(
             dataset = TensorflowTileDBDataset(
                 buffer_bytes=buffer_bytes,
                 batch_size=batch_size,
-                shuffle=shuffle,
+                shuffle_buffer_size=shuffle_buffer_size,
                 **kwargs,
             )
             assert isinstance(dataset, tf.data.Dataset)
@@ -67,7 +67,6 @@ def test_dataset(
             # covered so test it explicitly.
             generator = tensor_generator(
                 buffer_bytes=buffer_bytes,
-                shuffle=shuffle,
                 sparse_tensor_generator_cls=TensorflowSparseTileDBTensorGenerator,
                 **kwargs,
             )
@@ -87,9 +86,9 @@ def test_unequal_num_rows(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         with ingest_in_tiledb(
             tmpdir,
@@ -105,12 +104,12 @@ def test_unequal_num_rows(
                 TensorflowTileDBDataset(
                     buffer_bytes=buffer_bytes,
                     batch_size=batch_size,
-                    shuffle=shuffle,
+                    shuffle_buffer_size=shuffle_buffer_size,
                     **kwargs,
                 )
             assert "X and Y arrays must have the same number of rows" in str(ex.value)
 
-    @parametrize_for_dataset(x_sparse=[True], shuffle=[False])
+    @parametrize_for_dataset(x_sparse=[True], shuffle_buffer_size=[0])
     def test_sparse_read_order(
         self,
         tmpdir,
@@ -121,9 +120,9 @@ def test_sparse_read_order(
         y_shape,
         num_attrs,
         pass_attrs,
-        batch_size,
         buffer_bytes,
-        shuffle,
+        batch_size,
+        shuffle_buffer_size,
     ):
         x_data = rand_array(num_rows, *x_shape, sparse=x_sparse)
         with ingest_in_tiledb(
@@ -138,7 +137,7 @@ def test_sparse_read_order(
             dataset = TensorflowTileDBDataset(
                 buffer_bytes=buffer_bytes,
                 batch_size=batch_size,
-                shuffle=shuffle,
+                shuffle_buffer_size=shuffle_buffer_size,
                 **kwargs,
             )
             generated_x_data = np.concatenate(

tests/readers/utils.py

Lines changed: 5 additions & 5 deletions
@@ -20,7 +20,7 @@ def parametrize_for_dataset(
     pass_attrs=(True, False),
     batch_size=(8,),
     buffer_bytes=(1024, None),
-    shuffle=(True, False),
+    shuffle_buffer_size=(0, 16),
 ):
     def is_valid_combination(t):
         x_sparse_, y_sparse_, x_shape_, y_shape_, *_ = t
@@ -36,9 +36,9 @@ def is_valid_combination(t):
         "y_shape",
         "num_attrs",
         "pass_attrs",
-        "batch_size",
         "buffer_bytes",
-        "shuffle",
+        "batch_size",
+        "shuffle_buffer_size",
     ]
     argvalues = filter(
         is_valid_combination,
@@ -49,9 +49,9 @@ def is_valid_combination(t):
            y_shape,
            num_attrs,
            pass_attrs,
-           batch_size,
            buffer_bytes,
-           shuffle,
+           batch_size,
+           shuffle_buffer_size,
        ),
    )
    return pytest.mark.parametrize(argnames, argvalues)

tiledb/ml/readers/_batch_utils.py

Lines changed: 0 additions & 28 deletions
@@ -36,18 +36,6 @@ def read_buffer(self, array_slice: slice) -> None:
         :param array_slice: Requested array slice.
         """
 
-    @abstractmethod
-    def shuffle_buffer(self, buffer_slice: slice, row_idxs: np.ndarray) -> None:
-        """
-        Shuffle a slice of the current buffer.
-
-        Must be called after `read_buffer`.
-
-        :param buffer_slice: Slice of the current buffer to shuffle.
-        :param row_idxs: Shuffled indices; a shuffled version of
-            `np.arange(0, buffer_slice.stop - buffer_slice.start)`
-        """
-
     @abstractmethod
     def iter_tensors(self, buffer_slice: slice) -> Iterator[Tensor]:
         """
@@ -63,10 +51,6 @@ class TileDBNumpyGenerator(TileDBTensorGenerator[np.ndarray]):
     def read_buffer(self, array_slice: slice) -> None:
         self._buf_arrays = tuple(self._query[array_slice].values())
 
-    def shuffle_buffer(self, buffer_slice: slice, row_idxs: np.ndarray) -> None:
-        for buf_array in self._buf_arrays:
-            buf_array[buffer_slice] = buf_array[buffer_slice.start + row_idxs]
-
     def iter_tensors(self, buffer_slice: slice) -> Iterator[np.ndarray]:
         for buf_array in self._buf_arrays:
             yield buf_array[buffer_slice]
@@ -105,10 +89,6 @@ def read_buffer(self, array_slice: slice) -> None:
             for data in buffer.values()
         )
 
-    def shuffle_buffer(self, buffer_slice: slice, row_idxs: np.ndarray) -> None:
-        for buf_csr in self._buf_csrs:
-            buf_csr[buffer_slice] = buf_csr[buffer_slice.start + row_idxs]
-
     def iter_tensors(self, buffer_slice: slice) -> Iterator[Tensor]:
         for buf_csr, dtype in zip(self._buf_csrs, self._attr_dtypes):
             batch_csr = buf_csr[buffer_slice]
@@ -137,7 +117,6 @@ def tensor_generator(
     x_array: tiledb.Array,
     y_array: tiledb.Array,
     buffer_bytes: Optional[int] = None,
-    shuffle: bool = False,
     x_attrs: Sequence[str] = (),
     y_attrs: Sequence[str] = (),
     start_offset: int = 0,
@@ -159,7 +138,6 @@
     :param y_array: TileDB array of the labels.
    :param buffer_bytes: Maximum size (in bytes) of memory to allocate for reading from
        each array (default=`tiledb.default_ctx().config()["sm.memory_budget"]`).
-   :param shuffle: True for shuffling rows.
    :param x_attrs: Attribute names of x_array; defaults to all x_array attributes.
    :param y_attrs: Attribute names of y_array; defaults to all y_array attributes
    :param start_offset: Start row offset; defaults to 0.
@@ -205,12 +183,6 @@ def get_buffer_size_generator(
         elif batch.y_read_slice:
             y_gen.read_buffer(batch.y_read_slice)
 
-        if shuffle:
-            row_idxs = np.arange(batch.size)
-            np.random.shuffle(row_idxs)
-            x_gen.shuffle_buffer(batch.x_buffer_slice, row_idxs)
-            y_gen.shuffle_buffer(batch.y_buffer_slice, row_idxs)
-
         x_tensors = x_gen.iter_tensors(batch.x_buffer_slice)
         y_tensors = y_gen.iter_tensors(batch.y_buffer_slice)
         yield (*x_tensors, *y_tensors)
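
With the in-generator shuffling above removed, shuffling becomes the responsibility of the dataset layer via shuffle_buffer_size (per the commit message; the dataset modules themselves are not shown in this excerpt). In TensorFlow this maps naturally onto tf.data.Dataset.shuffle(buffer_size); for an iterable PyTorch dataset the same idea is a bounded shuffle buffer. A generic sketch of the technique, not the code added by this commit:

import random
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")

def iter_shuffled(rows: Iterable[T], buffer_size: int) -> Iterator[T]:
    # Keep up to buffer_size rows in memory and yield them in random order;
    # buffer_size=0 degenerates to a pass-through, matching the tests' use of
    # shuffle_buffer_size=[0] for order-sensitive sparse reads.
    buffer = []
    for row in rows:
        buffer.append(row)
        if len(buffer) >= buffer_size:
            # Swap a random buffered row to the end and yield it.
            idx = random.randrange(len(buffer))
            buffer[idx], buffer[-1] = buffer[-1], buffer[idx]
            yield buffer.pop()
    # Drain whatever is left at the end of the input.
    random.shuffle(buffer)
    yield from buffer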
