Pass Through Arguments Tensorflow (#170)

georgeSkoumas · web-flow · commit f6e09b6f9dab · 2022-07-27T12:31:01.000+03:00
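* removed the `batch_size`, `shuffle_buffer_size` and `prefetch` arguments from the Tensorflow reader; callers now apply `.shuffle()`, `.batch()` and `.prefetch()` on the returned `tf.data.Dataset`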

* tensorflow notebook update

* PR changes
diff --git a/examples/readers/tensorflow_data_api_tiledb_dense.ipynb b/examples/readers/tensorflow_data_api_tiledb_dense.ipynb
@@ -38,7 +38,6 @@
    "execution_count": 2,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -78,7 +77,6 @@
    "execution_count": 3,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -126,7 +124,6 @@
    "execution_count": 4,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -158,7 +155,6 @@
    "execution_count": 5,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -220,15 +216,14 @@
    "execution_count": 6,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "<matplotlib.image.AxesImage at 0x15cde5e80>"
+       "<matplotlib.image.AxesImage at 0x157c2e8e0>"
       ]
      },
      "execution_count": 6,
@@ -270,7 +265,6 @@
    "execution_count": 7,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -309,7 +303,6 @@
    "execution_count": 8,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -325,23 +318,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2022-07-05 18:13:38.031008: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "2022-07-26 13:24:54.724292: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
       "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "938/938 [==============================] - 4s 4ms/step - loss: 0.3456 - accuracy: 0.9013\n",
+      "938/938 [==============================] - 5s 4ms/step - loss: 0.3414 - accuracy: 0.9013\n",
       "Epoch 2/5\n",
-      "938/938 [==============================] - 3s 3ms/step - loss: 0.1683 - accuracy: 0.9510\n",
+      "938/938 [==============================] - 4s 4ms/step - loss: 0.1662 - accuracy: 0.9515\n",
       "Epoch 3/5\n",
-      "938/938 [==============================] - 3s 3ms/step - loss: 0.1254 - accuracy: 0.9631\n",
+      "938/938 [==============================] - 4s 4ms/step - loss: 0.1252 - accuracy: 0.9631\n",
       "Epoch 4/5\n",
-      "938/938 [==============================] - 3s 3ms/step - loss: 0.1037 - accuracy: 0.9687\n",
+      "938/938 [==============================] - 4s 4ms/step - loss: 0.1021 - accuracy: 0.9694\n",
       "Epoch 5/5\n",
-      "938/938 [==============================] - 3s 3ms/step - loss: 0.0873 - accuracy: 0.9730\n"
+      "938/938 [==============================] - 4s 4ms/step - loss: 0.0878 - accuracy: 0.9735\n"
      ]
     }
    ],
@@ -355,8 +348,9 @@
     "    tiledb_dataset = TensorflowTileDBDataset(\n",
     "        ArrayParams(array=x, fields=['features']),\n",
     "        ArrayParams(array=y, fields=['features']),\n",
-    "        batch_size=64, shuffle_buffer_size=128\n",
-    "    )\n",
+    "        num_workers=2        \n",
+    "    )    \n",
+    "    tiledb_dataset = tiledb_dataset.batch(64).shuffle(128)\n",
     "    model.fit(tiledb_dataset, epochs=5)"
    ]
   },
@@ -365,7 +359,6 @@
    "execution_count": 9,
    "metadata": {
     "pycharm": {
-     "is_executing": true,
      "name": "#%%\n"
     }
    },
@@ -397,13 +390,6 @@
    "source": [
     "model.summary()"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/examples/readers/tensorflow_data_api_tiledb_sparse.ipynb b/examples/readers/tensorflow_data_api_tiledb_sparse.ipynb
@@ -669,39 +669,45 @@
     }
    },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/2\n"
+     ]
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2022-06-22 17:31:04.946401: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
-      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-      "2022-06-22 17:31:05.168238: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n",
-      "/Users/konstantinostsitsimpikos/tileroot/TileDB-ML/venv2/lib/python3.9/site-packages/tensorflow/python/framework/indexed_slices.py:447: UserWarning: Converting sparse IndexedSlices(IndexedSlices(indices=Tensor(\"gradient_tape/sequential/dense/embedding_lookup_sparse/Reshape_1:0\", shape=(None,), dtype=int32), values=Tensor(\"gradient_tape/sequential/dense/embedding_lookup_sparse/Reshape:0\", shape=(None, 1), dtype=float32), dense_shape=Tensor(\"gradient_tape/sequential/dense/embedding_lookup_sparse/Cast:0\", shape=(2,), dtype=int32))) to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
-      "  warnings.warn(\n"
+      "2022-07-26 13:28:43.457755: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
+      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Epoch 1/2\n",
-      "3125/3125 [==============================] - 2s 485us/step - loss: 0.0000e+00 - accuracy: 0.0607\n",
+      "3125/3125 [==============================] - 3s 832us/step - loss: 0.0000e+00 - accuracy: 0.0607\n",
       "Epoch 2/2\n",
-      "3125/3125 [==============================] - 2s 464us/step - loss: 0.0000e+00 - accuracy: 0.0611\n"
+      "3125/3125 [==============================] - 3s 697us/step - loss: 0.0000e+00 - accuracy: 0.0611\n"
      ]
     }
    ],
    "source": [
     "from tiledb.ml.readers.tensorflow import TensorflowTileDBDataset, ArrayParams\n",
     "\n",
+    "import warnings\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
     "ctx = tiledb.Ctx({\"sm.memory_budget\": 1024**2, \"py.init_buffer_bytes\": 1024**2})\n",
     "with tiledb.open(training_images, ctx=ctx) as x, tiledb.open(training_labels, ctx=ctx) as y:\n",
     "    tiledb_dataset = TensorflowTileDBDataset(\n",
     "        ArrayParams(array=x, fields=['features']),\n",
-    "        ArrayParams(array=y, fields=['features']),\n",
-    "        batch_size=32)\n",
+    "        ArrayParams(array=y, fields=['features']))\n",
     "    model = design_model(input_shape=user_movie.shape[1])\n",
-    "    model.fit(tiledb_dataset, epochs=2, batch_size=32)"
+    "    tiledb_dataset = tiledb_dataset.batch(32)\n",
+    "    model.fit(tiledb_dataset, epochs=2)"
    ]
   }
  ],
@@ -726,4 +732,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
diff --git a/tests/readers/test_tensorflow.py b/tests/readers/test_tensorflow.py
@@ -9,6 +9,12 @@
 from .utils import ingest_in_tiledb, parametrize_for_dataset, validate_tensor_generator
 
 
+def dataset_batching_shuffling(dataset: tf.data.Dataset, batch_size: int, shuffle_buffer_size: int) -> tf.data.Dataset:
+    if shuffle_buffer_size > 0:
+        dataset = dataset.shuffle(shuffle_buffer_size)
+    return dataset.batch(batch_size)
+
+
 class TestTensorflowTileDBDataset:
     @parametrize_for_dataset()
     def test_dataset(
@@ -19,9 +25,12 @@ def test_dataset(
                 dataset = TensorflowTileDBDataset(
                     x_params,
                     y_params,
+                    num_workers=num_workers,
+                )
+                dataset = dataset_batching_shuffling(
+                    dataset=dataset,
                     batch_size=batch_size,
                     shuffle_buffer_size=shuffle_buffer_size,
-                    num_workers=num_workers,
                 )
                 assert isinstance(dataset, tf.data.Dataset)
                 validate_tensor_generator(
@@ -42,8 +51,6 @@ def test_unequal_num_keys(
                     TensorflowTileDBDataset(
                         x_params,
                         y_params,
-                        batch_size=batch_size,
-                        shuffle_buffer_size=shuffle_buffer_size,
                         num_workers=num_workers,
                     )
                 assert "All arrays must have the same key range" in str(ex.value)
@@ -62,9 +69,12 @@ def test_dataset_order(
                 dataset = TensorflowTileDBDataset(
                     x_params,
                     y_params,
+                    num_workers=num_workers,
+                )
+                dataset = dataset_batching_shuffling(
+                    dataset=dataset,
                     batch_size=batch_size,
                     shuffle_buffer_size=shuffle_buffer_size,
-                    num_workers=num_workers,
                 )
                 # since num_fields is 0, fields are all the array attributes of each array
                 # the first item of each batch corresponds to the first attribute (="data")
diff --git a/tiledb/ml/readers/tensorflow.py b/tiledb/ml/readers/tensorflow.py
@@ -13,18 +13,10 @@
 
 def TensorflowTileDBDataset(
     *all_array_params: ArrayParams,
-    batch_size: int,
-    shuffle_buffer_size: int = 0,
-    prefetch: int = tf.data.AUTOTUNE,
     num_workers: int = 0,
 ) -> tf.data.Dataset:
     """Return a tf.data.Dataset for loading data from TileDB arrays.
-
     :param all_array_params: One or more `ArrayParams` instances, one per TileDB array.
-    :param batch_size: Size of each batch.
-    :param shuffle_buffer_size: Number of elements from which this dataset will sample.
-    :param prefetch: Maximum number of batches that will be buffered when prefetching.
-        By default, the buffer size is dynamically tuned.
     :param num_workers: If greater than zero, create a threadpool of `num_workers` threads
         used to fetch inputs asynchronously and in parallel. Note: when `num_workers` > 1
         yielded batches may be shuffled even if the returned dataset is not explicitly shuffled.
@@ -60,9 +52,7 @@ def key_range_dataset(key_range_idx: int) -> tf.data.Dataset:
     else:
         dataset = key_range_dataset(0)
 
-    if shuffle_buffer_size > 0:
-        dataset = dataset.shuffle(shuffle_buffer_size)
-    return dataset.batch(batch_size).prefetch(prefetch)
+    return dataset
 
 
 _tensor_specs = {