tf mirrored strategy

codebasics · codebasics · commit ab25e112e434 · 2021-05-27T17:49:54.000-04:00
diff --git a/10_gpu_benchmarking/DGX_performance/dgx_benchamrking_tf_mirrored_stratergy.ipynb b/10_gpu_benchmarking/DGX_performance/dgx_benchamrking_tf_mirrored_stratergy.ipynb
@@ -44,7 +44,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {
     "scrolled": true
    },
@@ -53,11 +53,10 @@
      "data": {
       "text/plain": [
        "[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),\n",
-       " PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'),\n",
-       " PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU')]"
+       " PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -68,16 +67,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'2.2.0'"
+       "'2.3.0'"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -88,7 +87,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -97,7 +96,7 @@
        "True"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -129,25 +128,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
-      "170500096/170498071 [==============================] - 42s 0us/step\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "(X_train, y_train), (X_test,y_test) = tf.keras.datasets.cifar10.load_data()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -156,7 +146,7 @@
        "(50000, 32, 32, 3)"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -167,7 +157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 10,
    "metadata": {
     "scrolled": true
    },
@@ -178,7 +168,7 @@
        "(50000, 1)"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -401,7 +391,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 11,
    "metadata": {
     "scrolled": true
    },
@@ -413,7 +403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -453,7 +443,7 @@
    "cell_type": "code",
    "execution_count": 21,
    "metadata": {
-    "scrolled": true
+    "scrolled": false
    },
    "outputs": [
     {
@@ -476,50 +466,106 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": 15,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "<h4 style=\"color:purple\">Model building and training</h4>"
+    "train_tf_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train_categorical))\n",
+    "test_tf_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test_categorical))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {
-    "scrolled": true
-   },
+   "execution_count": 19,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1563/1563 [==============================] - 22s 14ms/step - loss: 1.8602 - accuracy: 0.3321\n"
+      "WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)\n"
      ]
-    },
+    }
+   ],
+   "source": [
+    "strategy = tf.distribute.MirroredStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
     {
      "data": {
       "text/plain": [
-       "<tensorflow.python.keras.callbacks.History at 0x7f697c1c82b0>"
+       "1"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "model = keras.Sequential([\n",
-    "        keras.layers.Flatten(input_shape=(32,32,3)),\n",
-    "        keras.layers.Dense(3000, activation='relu'),\n",
-    "        keras.layers.Dense(1000, activation='relu'),\n",
-    "        keras.layers.Dense(10, activation='sigmoid')    \n",
-    "    ])\n",
+    "strategy.num_replicas_in_sync"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BATCH_SIZE_PER_REPLICA = 64\n",
+    "BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync\n",
+    "SHUFFLE_BUFFER_SIZE = 100\n",
+    "\n",
+    "train_dataset = train_tf_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)\n",
+    "test_dataset = test_tf_dataset.batch(BATCH_SIZE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h4 style=\"color:purple\">Model building and training</h4>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:From C:\\Users\\dhava\\AppData\\Roaming\\Python\\Python38\\site-packages\\tensorflow\\python\\data\\ops\\multi_device_iterator_ops.py:601: get_next_as_optional (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "Use `tf.data.Iterator.get_next_as_optional()` instead.\n",
+      "782/782 [==============================] - 20s 26ms/step - loss: 1.9170 - accuracy: 0.3119\n"
+     ]
+    }
+   ],
+   "source": [
+    "with strategy.scope():\n",
+    "    model = keras.Sequential([\n",
+    "            keras.layers.Flatten(input_shape=(32,32,3)),\n",
+    "            keras.layers.Dense(3000, activation='relu'),\n",
+    "            keras.layers.Dense(1000, activation='relu'),\n",
+    "            keras.layers.Dense(10, activation='sigmoid')    \n",
+    "        ])\n",
     "\n",
-    "model.compile(optimizer='SGD',\n",
-    "              loss='categorical_crossentropy',\n",
-    "              metrics=['accuracy'])\n",
+    "    model.compile(optimizer='SGD',\n",
+    "                  loss='categorical_crossentropy',\n",
+    "                  metrics=['accuracy'])\n",
     "\n",
-    "model.fit(X_train_scaled, y_train_categorical, epochs=1)"
+    "    model.fit(train_dataset, epochs=1)"
    ]
   },
   {