Added Ray Train & PyTorch Lightning demo

Bobbins228 · Bobbins228 · commit 3defc67869ca · 2024-06-12T11:42:19.000+01:00
diff --git a/demo-notebooks/guided-demos/3_pytorch_lightning_demo.ipynb b/demo-notebooks/guided-demos/3_pytorch_lightning_demo.ipynb
@@ -0,0 +1,167 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this notebook we are going to run a Ray Train & PyTorch Lightning script using the CodeFlare SDK and Ray Job Submission.\n",
+    "\n",
+    "NOTE: For distributed training an external persistent storage option should be set in the `run_config`.\n",
+    "You can find examples in the `pytorch_lightning.py` script."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import pieces from codeflare-sdk\n",
+    "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create authentication object for user permissions\n",
+    "# If unused, the SDK will automatically check for the default kubeconfig, then the in-cluster config\n",
+    "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
+    "auth = TokenAuthentication(\n",
+    "    token = \"XXXXX\",\n",
+    "    server = \"XXXXX\",\n",
+    "    skip_tls=False\n",
+    ")\n",
+    "auth.login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once again, let's start by running through the same cluster setup as before:\n",
+    "\n",
+    "NOTE: We must specify the `image` which will be used in our RayCluster. We recommend you bring your own image which suits your purposes.\n",
+    "The example here is a community image."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create and configure our cluster object\n",
+    "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
+    "cluster = Cluster(ClusterConfiguration(\n",
+    "    name='raytest',\n",
+    "    namespace='default', # Update to your namespace\n",
+    "    num_workers=2,\n",
+    "    min_cpus=2,\n",
+    "    max_cpus=2,\n",
+    "    min_memory=8,\n",
+    "    max_memory=8,\n",
+    "    num_gpus=1,\n",
+    "    head_gpus=1,\n",
+    "    image=\"quay.io/project-codeflare/ray:latest-py39-cu118\",\n",
+    "    write_to_file=True, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n",
+    "    # local_queue=\"local-queue-name\" # Specify the local queue manually\n",
+    "))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bring up the cluster\n",
+    "cluster.up()\n",
+    "cluster.wait_ready()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.details()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the Job Submission Client\n",
+    "\"\"\"\n",
+    "The SDK will automatically gather the dashboard address and authenticate using the Ray Job Submission Client\n",
+    "\"\"\"\n",
+    "client = cluster.job_client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Submit the PyTorch Lightning FashionMNIST training job using the Job Submission Client\n",
+    "submission_id = client.submit_job(\n",
+    "    entrypoint=\"python pytorch_lightning.py\",\n",
+    "    runtime_env={\"working_dir\": \"./\",\"pip\": \"requirements_lightning.txt\"},\n",
+    ")\n",
+    "print(submission_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the job's logs\n",
+    "client.get_job_logs(submission_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the job's status\n",
+    "client.get_job_status(submission_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.down()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "auth.logout()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo-notebooks/guided-demos/pytorch_lightning.py b/demo-notebooks/guided-demos/pytorch_lightning.py
@@ -0,0 +1,116 @@
+import os
+import tempfile
+
+import torch
+from torch.utils.data import DataLoader
+from torchvision.models import resnet18
+from torchvision.datasets import FashionMNIST
+from torchvision.transforms import ToTensor, Normalize, Compose
+import lightning.pytorch as pl
+
+import ray.train.lightning
+from ray.train.torch import TorchTrainer
+
+# Based on https://docs.ray.io/en/latest/train/getting-started-pytorch-lightning.html
+
+"""
+# For S3 persistent storage, set the following environment variables to your AWS credentials, then uncomment the S3 run_config.
+# See here for information on how to set up an S3 bucket https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html
+
+os.environ["AWS_ACCESS_KEY_ID"] = "XXXXXXXX"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "XXXXXXXX"
+os.environ["AWS_DEFAULT_REGION"] = "XXXXXXXX"
+"""
+
+"""
+# For MinIO persistent storage, uncomment the following code and fill in the access key, secret key and API URL, then uncomment the MinIO run_config.
+# See here for information on how to set up a MinIO bucket https://ai-on-openshift.io/tools-and-applications/minio/minio/
+
+def get_minio_run_config():
+   import s3fs
+   import pyarrow.fs
+
+   s3_fs = s3fs.S3FileSystem(
+       key = os.getenv('MINIO_ACCESS_KEY', "XXXXX"),
+       secret = os.getenv('MINIO_SECRET_ACCESS_KEY', "XXXXX"),
+       endpoint_url = os.getenv('MINIO_URL', "XXXXX")
+   )
+
+   custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))
+
+   run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)
+   return run_config
+"""
+
+
+# Model, Loss, Optimizer
+class ImageClassifier(pl.LightningModule):
+    def __init__(self):
+        super(ImageClassifier, self).__init__()
+        self.model = resnet18(num_classes=10)
+        self.model.conv1 = torch.nn.Conv2d(
+            1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False
+        )
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+    def forward(self, x):
+        return self.model(x)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        outputs = self.forward(x)
+        loss = self.criterion(outputs, y)
+        self.log("loss", loss, on_step=True, prog_bar=True)
+        return loss
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.model.parameters(), lr=0.001)
+
+
+def train_func():
+    # Data
+    transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
+    data_dir = os.path.join(tempfile.gettempdir(), "data")
+    train_data = FashionMNIST(
+        root=data_dir, train=True, download=True, transform=transform
+    )
+    train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)
+
+    # Training
+    model = ImageClassifier()
+    # [1] Configure PyTorch Lightning Trainer.
+    trainer = pl.Trainer(
+        max_epochs=10,
+        devices="auto",
+        accelerator="auto",
+        strategy=ray.train.lightning.RayDDPStrategy(),
+        plugins=[ray.train.lightning.RayLightningEnvironment()],
+        callbacks=[ray.train.lightning.RayTrainReportCallback()],
+        # [1a] Optionally, disable the default checkpointing behavior
+        # in favor of the `RayTrainReportCallback` above.
+        enable_checkpointing=False,
+    )
+    trainer = ray.train.lightning.prepare_trainer(trainer)
+    trainer.fit(model, train_dataloaders=train_dataloader)
+
+
+# [2] Configure scaling and resource requirements. Set `num_workers` to the total number of GPUs on your Ray Cluster (3 matches the demo notebook's cluster: 2 workers plus the head node, 1 GPU each).
+scaling_config = ray.train.ScalingConfig(num_workers=3, use_gpu=True)
+
+# [3] Launch distributed training job.
+trainer = TorchTrainer(
+    train_func,
+    scaling_config=scaling_config,
+    # run_config = ray.train.RunConfig(storage_path="s3://BUCKET_NAME/SUB_PATH/", name="unique_run_name") # Uncomment and update the S3 URI for S3 persistent storage.
+    # run_config=get_minio_run_config(), # Uncomment for minio persistent storage.
+)
+result: ray.train.Result = trainer.fit()
+
+# [4] Load the trained model from the checkpoint file named by `RayTrainReportCallback.CHECKPOINT_NAME`.
+with result.checkpoint.as_directory() as checkpoint_dir:
+    model = ImageClassifier.load_from_checkpoint(
+        os.path.join(
+            checkpoint_dir,
+            ray.train.lightning.RayTrainReportCallback.CHECKPOINT_NAME,
+        ),
+    )
diff --git a/demo-notebooks/guided-demos/requirements_lightning.txt b/demo-notebooks/guided-demos/requirements_lightning.txt
@@ -0,0 +1,5 @@
+torch==2.3.0
+torchvision==0.18.0
+lightning==2.2.5
+ray[train]==2.20.0
+s3fs==2024.6.0