
Commit 8522fbc

Sky benchmark (skypilot-org#832)
1 parent 06e2b2b commit 8522fbc

25 files changed, +2493 -66 lines

examples/benchmark/keras_asr.yaml (new file, +30 lines)

name: ljspeech-asr

resources:
  candidates:
  - {accelerators: T4}
  - {accelerators: V100}

workdir: ./examples/benchmark/keras_asr

setup: |
  conda create -n keras python=3.8 -y
  conda activate keras

  # Install SkyCallback
  git clone git@github.com:sky-proj/sky.git
  pip install sky/sky/callbacks/

  # User setup
  pip install numpy pandas tensorflow
  git clone https://github.com/keras-team/keras-io.git
  cd keras-io
  git checkout 49a16474cc5bbf86792bb7557a70d13fdb7a9c97

  # Apply the patch to enable SkyCallback
  git apply ../callback.patch

run: |
  conda activate keras
  cd keras-io/examples/audio/
  python transformer_asr.py

examples/benchmark/keras_asr/callback.patch (new file, +21 lines)

diff --git a/examples/audio/transformer_asr.py b/examples/audio/transformer_asr.py
index 8cd3e04..5edf885 100644
--- a/examples/audio/transformer_asr.py
+++ b/examples/audio/transformer_asr.py
@@ -35,6 +35,7 @@ from glob import glob
 import tensorflow as tf
 from tensorflow import keras
 from tensorflow.keras import layers
+from sky_callback import SkyKerasCallback


 """
@@ -520,7 +521,7 @@ learning_rate = CustomSchedule(
 optimizer = keras.optimizers.Adam(learning_rate)
 model.compile(optimizer=optimizer, loss=loss_fn)

-history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=1)
+history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb, SkyKerasCallback()], epochs=1)

 """
 In practice, you should train for around 100 epochs or more.
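
This patch is the whole integration: the example only has to add the callback to model.fit. The callback class itself ships in sky/sky/callbacks/ and is not shown in this commit; the class below is an illustrative sketch of what a Keras timing/progress callback of this kind could look like, not the actual SkyKerasCallback.

import time

from tensorflow import keras


class TimingKerasCallback(keras.callbacks.Callback):
    """Illustrative only: records per-batch wall-clock time during training."""

    def __init__(self):
        super().__init__()
        self.batch_times = []
        self._batch_start = None

    def on_train_batch_begin(self, batch, logs=None):
        self._batch_start = time.time()

    def on_train_batch_end(self, batch, logs=None):
        self.batch_times.append(time.time() - self._batch_start)

    def on_epoch_end(self, epoch, logs=None):
        if self.batch_times:
            avg = sum(self.batch_times) / len(self.batch_times)
            print(f"epoch {epoch}: avg batch time {avg:.3f}s")

Registering such a callback in callbacks=[...], exactly as the patch does, is the only change the training script needs.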

examples/benchmark/lightning_gan.yaml (new file, +30 lines)

name: mnist-gan

resources:
  candidates:
  - {accelerators: T4}
  - {accelerators: V100}

workdir: ./examples/benchmark/lightning_gan

setup: |
  conda create -n pl python=3.8 -y
  conda activate pl

  # Install SkyCallback
  git clone git@github.com:sky-proj/sky.git
  pip install sky/sky/callbacks/

  # User setup
  pip install "torchvision" "pytorch-lightning>=1.4" "torch>=1.6, <1.9"
  git clone https://github.com/Lightning-AI/tutorials.git
  cd tutorials
  git checkout e22e229921a97ea241277e19e0eaddedc35808cb

  # Apply the patch to enable SkyCallback
  git apply ../callback.patch

run: |
  conda activate pl
  cd tutorials/lightning_examples/basic-gan/
  python gan.py

examples/benchmark/lightning_gan/callback.patch (new file, +21 lines)

diff --git a/lightning_examples/basic-gan/gan.py b/lightning_examples/basic-gan/gan.py
index 24520fa..4a1e988 100644
--- a/lightning_examples/basic-gan/gan.py
+++ b/lightning_examples/basic-gan/gan.py
@@ -11,6 +11,7 @@ from pytorch_lightning import LightningDataModule, LightningModule, Trainer
 from pytorch_lightning.callbacks.progress import TQDMProgressBar
 from torch.utils.data import DataLoader, random_split
 from torchvision.datasets import MNIST
+from sky_callback import SkyLightningCallback

 PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
 BATCH_SIZE = 256 if torch.cuda.is_available() else 64
@@ -253,7 +254,7 @@ trainer = Trainer(
     accelerator="auto",
     devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
     max_epochs=5,
-    callbacks=[TQDMProgressBar(refresh_rate=20)],
+    callbacks=[TQDMProgressBar(refresh_rate=20), SkyLightningCallback()],
 )
 trainer.fit(model, dm)
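
As in the Keras example, only the callbacks list passed to Trainer changes. A minimal sketch of a Lightning callback in the same spirit (an illustrative assumption only; the real SkyLightningCallback comes from sky/sky/callbacks/):

import time

from pytorch_lightning.callbacks import Callback


class TimingLightningCallback(Callback):
    """Illustrative only: measures per-batch training time and prints an epoch average."""

    def __init__(self):
        self.batch_times = []
        self._batch_start = None

    def on_train_batch_start(self, trainer, pl_module, *args, **kwargs):
        self._batch_start = time.time()

    def on_train_batch_end(self, trainer, pl_module, *args, **kwargs):
        self.batch_times.append(time.time() - self._batch_start)

    def on_train_epoch_end(self, trainer, pl_module, *args, **kwargs):
        if self.batch_times:
            avg = sum(self.batch_times) / len(self.batch_times)
            print(f"epoch {trainer.current_epoch}: avg batch time {avg:.3f}s")

The *args/**kwargs signatures keep the sketch usable across the pytorch-lightning>=1.4 range pinned in the YAML, since the exact hook signatures changed between minor releases.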

examples/benchmark/timm.yaml (new file, +37 lines)

name: resnet50-randaug

resources:
  candidates:
  - {accelerators: T4:4}
  - {accelerators: V100:4}

workdir: ./examples/benchmark/timm

setup: |
  conda create -n timm python=3.8 -y
  conda activate timm

  # Install SkyCallback
  git clone git@github.com:sky-proj/sky.git
  pip install sky/sky/callbacks/

  # User setup
  git clone https://github.com/rwightman/pytorch-image-models.git timm
  cd timm
  git checkout v0.5.4
  pip install -r requirements.txt

  # Apply the patch to enable SkyCallback
  git apply ../callback.patch

  # Apply the patch to use a dummy ImageNet dataset to avoid data downloading
  git apply ../dummy_dataset.patch

run: |
  conda activate timm
  cd timm
  python3 -m torch.distributed.launch --nproc_per_node=4 train.py \
    -b 64 --model resnet50 --sched cosine --epochs 200 --lr 0.05 \
    --amp --remode pixel --reprob 0.6 --aug-splits 3 \
    --aa rand-m9-mstd0.5-inc1 --resplit --split-bn --jsd \
    --dist-bn reduce

examples/benchmark/timm/callback.patch (new file, +35 lines)

diff --git a/train.py b/train.py
index 6e3b058..8c61ed4 100755
--- a/train.py
+++ b/train.py
@@ -58,6 +58,9 @@ try:
 except ImportError:
     has_wandb = False

+import sky_callback
+from sky_callback import step_iterator
+
 torch.backends.cudnn.benchmark = True
 _logger = logging.getLogger('train')

@@ -609,6 +612,11 @@ def main():
         with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
             f.write(args_text)

+    sky_callback.init(
+        global_rank=args.rank,
+        total_steps=num_epochs * len(loader_train),
+    )
+
     try:
         for epoch in range(start_epoch, num_epochs):
             if args.distributed and hasattr(loader_train.sampler, 'set_epoch'):
@@ -674,7 +682,7 @@ def train_one_epoch(
     end = time.time()
     last_idx = len(loader) - 1
     num_updates = epoch * len(loader)
-    for batch_idx, (input, target) in enumerate(loader):
+    for batch_idx, (input, target) in step_iterator(enumerate(loader)):
         last_batch = batch_idx == last_idx
         data_time_m.update(time.time() - end)
         if not args.prefetcher:
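
Because timm's training loop is a plain for loop rather than a framework with callback hooks, the patch uses two entry points: sky_callback.init(...) once before training and step_iterator(...) around the batch loop. The sketch below shows one way such a pair could be structured; it is an assumption for illustration only, since the real sky_callback package lives in sky/sky/callbacks/ and is not part of this commit.

import time

# Module-level state shared between init() and step_iterator().
_state = {"global_rank": 0, "total_steps": None, "step": 0, "start": None}


def init(global_rank=0, total_steps=None):
    """Record benchmark metadata before the training loop starts (assumed API)."""
    _state.update(global_rank=global_rank, total_steps=total_steps,
                  step=0, start=time.time())


def step_iterator(iterable):
    """Wrap an iterable of training steps and report progress on rank 0."""
    for item in iterable:
        yield item
        _state["step"] += 1
        if _state["global_rank"] == 0 and _state["step"] % 100 == 0:
            elapsed = time.time() - _state["start"]
            print(f"step {_state['step']}/{_state['total_steps']}: "
                  f"{_state['step'] / elapsed:.1f} steps/s")

Wrapping enumerate(loader) as the patch does means the loop body is untouched; only per-step accounting is added around it.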

examples/benchmark/timm/dummy_dataset.patch (new file, +71 lines)

# A patch file to replace ImageNet with a dummy dataset.
# Use only for benchmarking purposes.

diff --git a/train.py b/train.py
index 6e3b058..8ddbcdd 100755
--- a/train.py
+++ b/train.py
@@ -61,6 +61,34 @@ except ImportError:
 torch.backends.cudnn.benchmark = True
 _logger = logging.getLogger('train')

+
+class DummyImageDataset(torch.utils.data.Dataset):
+    """Dummy dataset with synthetic images."""
+    _IMAGE_HEIGHT = 3072
+    _IMAGE_WIDTH = 2304
+
+    def __init__(self, num_images, num_classes):
+        import numpy as np
+        from PIL import Image
+        imarray = np.random.rand(self._IMAGE_HEIGHT, self._IMAGE_WIDTH, 3) * 255
+        self.img = Image.fromarray(imarray.astype('uint8')).convert('RGB')
+        self.num_images = num_images
+        self.num_classes = num_classes
+        self.transform = None
+        self.target_transform = None
+
+    def __len__(self):
+        return self.num_images
+
+    def __getitem__(self, idx):
+        if self.transform is not None:
+            img = self.transform(self.img)
+        target = idx % self.num_classes
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+        return img, target
+
+
 # The first arg parser parses out only the --config argument, this argument is used to
 # load a yaml file containing key-values that override the defaults for the main parser below
 config_parser = parser = argparse.ArgumentParser(description='Training Config', add_help=False)
@@ -71,8 +99,6 @@ parser.add_argument('-c', '--config', default='', type=str, metavar='FILE',
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

 # Dataset parameters
-parser.add_argument('data_dir', metavar='DIR',
-                    help='path to dataset')
 parser.add_argument('--dataset', '-d', metavar='NAME', default='',
                     help='dataset type (default: ImageFolder/ImageTar if empty)')
 parser.add_argument('--train-split', metavar='NAME', default='train',
@@ -486,17 +512,8 @@ def main():
     _logger.info('Scheduled epochs: {}'.format(num_epochs))

     # create the train and eval datasets
-    dataset_train = create_dataset(
-        args.dataset, root=args.data_dir, split=args.train_split, is_training=True,
-        class_map=args.class_map,
-        download=args.dataset_download,
-        batch_size=args.batch_size,
-        repeats=args.epoch_repeats)
-    dataset_eval = create_dataset(
-        args.dataset, root=args.data_dir, split=args.val_split, is_training=False,
-        class_map=args.class_map,
-        download=args.dataset_download,
-        batch_size=args.batch_size)
+    dataset_train = DummyImageDataset(num_images=1231167, num_classes=1000)
+    dataset_eval = DummyImageDataset(num_images=50000, num_classes=1000)

     # setup mixup / cutmix
     collate_fn = None
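
Note that DummyImageDataset.__getitem__ relies on the data loader later assigning a transform (timm's create_loader does this); if transform stayed None, img would be unbound. The snippet below is a condensed, self-contained variant of the same idea for a quick shape check outside timm; the class name, image size, and fixed ToTensor transform are illustrative and not part of the commit.

import numpy as np
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms


class SyntheticImageDataset(Dataset):
    """Serves one synthetic image repeatedly, with labels cycling over the classes."""

    def __init__(self, num_images, num_classes, size=(224, 224)):
        arr = (np.random.rand(size[0], size[1], 3) * 255).astype('uint8')
        self.img = Image.fromarray(arr).convert('RGB')
        self.num_images = num_images
        self.num_classes = num_classes
        # Fixed transform so the dataset works without timm's create_loader.
        self.transform = transforms.ToTensor()

    def __len__(self):
        return self.num_images

    def __getitem__(self, idx):
        return self.transform(self.img), idx % self.num_classes


# One batch is enough to confirm the shapes fed to the model.
loader = DataLoader(SyntheticImageDataset(1024, 1000), batch_size=64)
images, targets = next(iter(loader))
print(images.shape, targets.shape)  # torch.Size([64, 3, 224, 224]) torch.Size([64])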

examples/benchmark/transformers_qa.yaml (new file, +42 lines)

name: squad_v2

resources:
  candidates:
  - {accelerators: T4:8}
  - {accelerators: V100:8}

workdir: ./examples/benchmark/transformers_qa

setup: |
  conda create -n hf python=3.8 -y
  conda activate hf

  # Install SkyCallback
  git clone git@github.com:sky-proj/sky.git
  pip install sky/sky/callbacks/

  # User setup
  pip install transformers
  git clone https://github.com/huggingface/transformers.git
  cd transformers
  git checkout v4.20.0
  pip install -r examples/pytorch/question-answering/requirements.txt

  # Apply the patch to enable SkyCallback
  git apply ../callback.patch

run: |
  conda activate hf
  cd transformers/examples/pytorch/question-answering/
  python run_qa.py \
    --model_name_or_path bert-base-uncased \
    --dataset_name squad_v2 \
    --do_train \
    --do_eval \
    --per_device_train_batch_size 12 \
    --learning_rate 3e-5 \
    --num_train_epochs 2 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --version_2_with_negative \
    --output_dir outputs/

examples/benchmark/transformers_qa/callback.patch (new file, +20 lines)

diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index f8f2ad7db..82fd64221 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -26,6 +26,7 @@ from typing import Optional

 import datasets
 from datasets import load_dataset, load_metric
+from sky_callback import SkyTransformersCallback

 import transformers
 from trainer_qa import QuestionAnsweringTrainer
@@ -609,6 +610,7 @@ def main():
         data_collator=data_collator,
         post_process_function=post_processing_function,
         compute_metrics=compute_metrics,
+        callbacks=[SkyTransformersCallback()],
     )

     # Training
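
Here the integration point is the Trainer's callbacks argument. For reference, a minimal TrainerCallback that reports throughput could look like the sketch below; it is an illustrative assumption, not the actual SkyTransformersCallback.

import time

from transformers import TrainerCallback


class TimingTransformersCallback(TrainerCallback):
    """Illustrative only: prints training throughput at each logging step."""

    def on_train_begin(self, args, state, control, **kwargs):
        self._start = time.time()

    def on_step_end(self, args, state, control, **kwargs):
        if state.is_world_process_zero and state.global_step % args.logging_steps == 0:
            elapsed = time.time() - self._start
            print(f"step {state.global_step}/{state.max_steps}: "
                  f"{state.global_step / elapsed:.2f} steps/s")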

sky/__init__.py (+2 lines)

@@ -3,6 +3,7 @@

 # Keep this order to avoid cyclic imports
 from sky import backends
+from sky import benchmark
 from sky import clouds
 from sky.clouds.service_catalog import list_accelerators
 from sky.dag import Dag, DagContext
@@ -33,6 +34,7 @@
     'Resources',
     'Task',
     'backends',
+    'benchmark',
     'launch',
     'exec',
     'list_accelerators',

sky/benchmark/__init__.py (new file, whitespace-only)
