From d1e233c606e1928a29fa3763952f97ac54751864 Mon Sep 17 00:00:00 2001
From: knoriy
Date: Tue, 2 Aug 2022 18:27:30 +0000
Subject: [PATCH 1/7] add laion sbatch

---
 slurm.sh | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 slurm.sh

diff --git a/slurm.sh b/slurm.sh
new file mode 100644
index 0000000..4d51d2b
--- /dev/null
+++ b/slurm.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#SBATCH --partition=compute-od-gpu
+#SBATCH --job-name=pl_test
+#SBATCH --nodes=10
+#SBATCH --exclusive
+#SBATCH --output=%x_%j.out
+
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
+export NCCL_PROTO=simple
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/aws-ofi-nccl/lib
+export PATH=$PATH:/opt/amazon/efa/bin:/opt/amazon/openmpi/bin
+export FI_EFA_FORK_SAFE=1
+export FI_LOG_LEVEL=1
+export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
+export NCCL_DEBUG=info
+export OMPI_MCA_mtl_base_verbose=1
+export FI_EFA_ENABLE_SHM_TRANSFER=0
+export FI_PROVIDER=efa
+export FI_EFA_TX_MIN_CREDITS=64
+export NCCL_TREE_THRESHOLD=0
+
+srun /home/knoriy/fsx/miniconda3/envs/pl/bin/python /home/knoriy/deep-learning-project-template/project/lit_mnist.py --accelerator gpu --strategy ddp

From c104b48eb86e388959332158679337af6148d2df Mon Sep 17 00:00:00 2001
From: knoriy
Date: Fri, 5 Aug 2022 09:44:19 +0000
Subject: [PATCH 2/7] _

---
 project/lit_mnist.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/project/lit_mnist.py b/project/lit_mnist.py
index 8733378..fb50fb2 100644
--- a/project/lit_mnist.py
+++ b/project/lit_mnist.py
@@ -60,6 +60,8 @@ def cli_main():
     # ------------
     parser = ArgumentParser()
     parser.add_argument('--batch_size', default=32, type=int)
+    parser.add_argument('--num_workers', default=1, type=int)
+
     parser = pl.Trainer.add_argparse_args(parser)
     parser = LitClassifier.add_model_specific_args(parser)
     args = parser.parse_args()
@@ -71,9 +73,9 @@ def cli_main():
     mnist_test = MNIST('', train=False, download=True, transform=transforms.ToTensor())
     mnist_train, mnist_val = random_split(dataset, [55000, 5000])
 
-    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
-    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
-    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)
+    train_loader = DataLoader(mnist_train, batch_size=args.batch_size, num_workers=args.num_workers)
+    val_loader = DataLoader(mnist_val, batch_size=args.batch_size, num_workers=args.num_workers)
+    test_loader = DataLoader(mnist_test, batch_size=args.batch_size, num_workers=args.num_workers)
 
     # ------------
     # model

From 3e865b7ad409d240a9394b50a64c6b93df4e83a2 Mon Sep 17 00:00:00 2001
From: Kari Noriy
Date: Tue, 9 Aug 2022 16:52:11 +0100
Subject: [PATCH 3/7] change lit_mnist.py to use LightningDataModule

---
 project/lit_mnist.py | 191 +++++++++++++++++++++++++------------------
 1 file changed, 110 insertions(+), 81 deletions(-)

diff --git a/project/lit_mnist.py b/project/lit_mnist.py
index fb50fb2..6027063 100644
--- a/project/lit_mnist.py
+++ b/project/lit_mnist.py
@@ -1,6 +1,8 @@
 from argparse import ArgumentParser
 
 import torch
+from torch import nn
+
 import pytorch_lightning as pl
 from torch.nn import functional as F
 from torch.utils.data import DataLoader, random_split
@@ -8,91 +10,118 @@
 from torchvision.datasets.mnist import MNIST
 from torchvision import transforms
 
+from typing import Optional
+
 
-class LitClassifier(pl.LightningModule):
-    def __init__(self, hidden_dim=128, learning_rate=1e-3):
-        super().__init__()
-        self.save_hyperparameters()
-
-        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
-        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)
-
-    def forward(self, x):
-        x = x.view(x.size(0), -1)
-        x = torch.relu(self.l1(x))
-        x = torch.relu(self.l2(x))
-        return x
-
-    def training_step(self, batch, batch_idx):
-        x, y = batch
-        y_hat = self(x)
-        loss = F.cross_entropy(y_hat, y)
-        return loss
-
-    def validation_step(self, batch, batch_idx):
-        x, y = batch
-        y_hat = self(x)
-        loss = F.cross_entropy(y_hat, y)
-        self.log('valid_loss', loss)
-
-    def test_step(self, batch, batch_idx):
-        x, y = batch
-        y_hat = self(x)
-        loss = F.cross_entropy(y_hat, y)
-        self.log('test_loss', loss)
-
-    def configure_optimizers(self):
-        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
-
-    @staticmethod
-    def add_model_specific_args(parent_parser):
-        parser = ArgumentParser(parents=[parent_parser], add_help=False)
-        parser.add_argument('--hidden_dim', type=int, default=128)
-        parser.add_argument('--learning_rate', type=float, default=0.0001)
-        return parser
+
+class MyModule(nn.Module):
+	'''
+	Simple two-layer MLP used by LitClassifier.
+	'''
+	def __init__(self, hidden_dim) -> None:
+		super().__init__()
+		self.l1 = torch.nn.Linear(28 * 28, hidden_dim)
+		self.l2 = torch.nn.Linear(hidden_dim, 10)
+
+	def forward(self, x):
+		x = x.view(x.size(0), -1)
+		x = torch.relu(self.l1(x))
+		x = torch.relu(self.l2(x))
+		return x
+
+class LitClassifier(pl.LightningModule):
+	def __init__(self, hidden_dim=128, learning_rate=1e-3):
+		super().__init__()
+		self.save_hyperparameters()
+
+		self.model = MyModule(self.hparams.hidden_dim)
+
+	def forward(self, x):
+		return self.model(x)
+
+	def training_step(self, batch, batch_idx):
+		x, y = batch
+		y_hat = self(x)
+		loss = F.cross_entropy(y_hat, y)
+		return loss
+
+	def validation_step(self, batch, batch_idx):
+		x, y = batch
+		y_hat = self(x)
+		loss = F.cross_entropy(y_hat, y)
+		self.log('valid_loss', loss)
+
+	def test_step(self, batch, batch_idx):
+		x, y = batch
+		y_hat = self(x)
+		loss = F.cross_entropy(y_hat, y)
+		self.log('test_loss', loss)
+
+	def configure_optimizers(self):
+		return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
+
+	@staticmethod
+	def add_model_specific_args(parent_parser):
+		parser = ArgumentParser(parents=[parent_parser], add_help=False)
+		parser.add_argument('--hidden_dim', type=int, default=128)
+		parser.add_argument('--learning_rate', type=float, default=0.0001)
+		return parser
+
+class MNISTDataModule(pl.LightningDataModule):
+	def __init__(self, data_dir: str = "", batch_size: int = 32):
+		super().__init__()
+		self.data_dir = data_dir
+		self.batch_size = batch_size
+
+	def setup(self, stage: Optional[str] = None):
+		dataset = MNIST(self.data_dir, train=True, download=True, transform=transforms.ToTensor())
+		self.mnist_test = MNIST(self.data_dir, train=False, download=True, transform=transforms.ToTensor())
+		self.mnist_train, self.mnist_val = random_split(dataset, [55000, 5000])
+
+	def train_dataloader(self):
+		return DataLoader(self.mnist_train, batch_size=self.batch_size)
+
+	def val_dataloader(self):
+		return DataLoader(self.mnist_val, batch_size=self.batch_size)
+
+	def test_dataloader(self):
+		return DataLoader(self.mnist_test, batch_size=self.batch_size)
 
 
 def cli_main():
-    pl.seed_everything(1234)
-
-    # ------------
-    # args
-    # ------------
-    parser = ArgumentParser()
-    parser.add_argument('--batch_size', default=32, type=int)
-    parser.add_argument('--num_workers', default=1, type=int)
-
-    parser = pl.Trainer.add_argparse_args(parser)
-    parser = LitClassifier.add_model_specific_args(parser)
-    args = parser.parse_args()
-
-    # ------------
-    # data
-    # ------------
-    dataset = MNIST('', train=True, download=True, transform=transforms.ToTensor())
-    mnist_test = MNIST('', train=False, download=True, transform=transforms.ToTensor())
-    mnist_train, mnist_val = random_split(dataset, [55000, 5000])
-
-    train_loader = DataLoader(mnist_train, batch_size=args.batch_size, num_workers=args.num_workers)
-    val_loader = DataLoader(mnist_val, batch_size=args.batch_size, num_workers=args.num_workers)
-    test_loader = DataLoader(mnist_test, batch_size=args.batch_size, num_workers=args.num_workers)
-
-    # ------------
-    # model
-    # ------------
-    model = LitClassifier(args.hidden_dim, args.learning_rate)
-
-    # ------------
-    # training
-    # ------------
-    trainer = pl.Trainer.from_argparse_args(args)
-    trainer.fit(model, train_loader, val_loader)
-
-    # ------------
-    # testing
-    # ------------
-    trainer.test(test_dataloaders=test_loader)
+	pl.seed_everything(1234)
+
+	# ------------
+	# args
+	# ------------
+	parser = ArgumentParser()
+	parser.add_argument('--batch_size', default=32, type=int)
+	parser.add_argument('--num_workers', default=1, type=int)
+
+	parser = pl.Trainer.add_argparse_args(parser)
+	parser = LitClassifier.add_model_specific_args(parser)
+	args = parser.parse_args()
+
+	# ------------
+	# data
+	# ------------
+	mnist = MNISTDataModule('', args.batch_size)
+
+	# ------------
+	# model
+	# ------------
+	model = LitClassifier(args.hidden_dim, args.learning_rate)
+
+	# ------------
+	# training
+	# ------------
+	trainer = pl.Trainer.from_argparse_args(args)
+	trainer.fit(model, datamodule=mnist)
+
+	# ------------
+	# testing
+	# ------------
+	trainer.test(datamodule=mnist)
 
 
 if __name__ == '__main__':
-    cli_main()
+	cli_main()

From 5985a6616ba4cae238728917801c5618bde3a125 Mon Sep 17 00:00:00 2001
From: knoriy
Date: Fri, 26 Aug 2022 12:11:01 +0000
Subject: [PATCH 4/7] fix bug where using multiple nodes would hang when initialising GPUs

---
 slurm.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/slurm.sh b/slurm.sh
index 4d51d2b..17414a4 100644
--- a/slurm.sh
+++ b/slurm.sh
@@ -1,8 +1,10 @@
 #!/bin/bash
-#SBATCH --partition=compute-od-gpu
-#SBATCH --job-name=pl_test
-#SBATCH --nodes=10
+#SBATCH --partition=gpu
+#SBATCH --job-name=multinode_pl_test
+#SBATCH --nodes=2
 #SBATCH --exclusive
+#SBATCH --comment clap
+#SBATCH --ntasks-per-node=8
 #SBATCH --output=%x_%j.out
 
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib
@@ -18,5 +20,6 @@ export FI_EFA_ENABLE_SHM_TRANSFER=0
 export FI_PROVIDER=efa
 export FI_EFA_TX_MIN_CREDITS=64
 export NCCL_TREE_THRESHOLD=0
+export NCCL_SOCKET_IFNAME=^docker0,lo
 
-srun /home/knoriy/fsx/miniconda3/envs/pl/bin/python /home/knoriy/deep-learning-project-template/project/lit_mnist.py --accelerator gpu --strategy ddp
+srun --comment clap /home/knoriy/fsx/miniconda3/envs/clasp/bin/python /home/knoriy/deep-learning-project-template/project/lit_mnist.py --accelerator gpu --strategy ddp --num_nodes 2 --devices 8

From 92d87c36e890b189855a4c058b3ea54d77bb4a53 Mon Sep 17 00:00:00 2001
From: Kari Noriy
Date: Fri, 2 Sep 2022 11:33:22 +0100
Subject: [PATCH 5/7] updated requirements

---
 requirements.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 30840f0..8fb2e0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
-pytorch-lightning >= 1.0.0rc2
-torch >= 1.3.0
-torchvision >= 0.6.0
+pytorch-lightning >= 1.7.2
+torch
+torchvision
+torchaudio

From 4905f793625a5c55e69333947b38abeae9a8eb53 Mon Sep 17 00:00:00 2001
From: Kari Noriy
Date: Fri, 2 Sep 2022 11:38:45 +0100
Subject: [PATCH 6/7] Added run scripts for GPU and multi-GPU training

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 63571ba..fb6c827 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,13 @@ pip install -r requirements.txt
 cd project
 
 # run module (example: mnist as your main contribution)
-python lit_classifier_main.py
+python lit_mnist.py
+
+# train on GPU
+python lit_mnist.py --max_epochs 100 --accelerator gpu
+
+# Multi GPU
+python lit_mnist.py --max_epochs 100 --accelerator gpu --strategy ddp --devices 2
 ```
 
 ## Imports

From 76c1be4afc2d8323d7f3d522a51de32c0997c222 Mon Sep 17 00:00:00 2001
From: knoriy
Date: Wed, 20 Sep 2023 12:05:33 +0000
Subject: [PATCH 7/7] updated to use Lightning CLI

---
 .gitignore         |  2 ++
 README.md          | 33 +++------------------------------
 config/config.yaml | 31 +++++++++++++++++++++++++++++++
 project/train.py   | 17 +++++++++++++++++
 4 files changed, 53 insertions(+), 30 deletions(-)
 create mode 100644 config/config.yaml
 create mode 100644 project/train.py

diff --git a/.gitignore b/.gitignore
index 06f9346..e607e4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,5 @@ venv.bak/
 lightning_logs/
 MNIST
 .DS_Store
+
+logs/
\ No newline at end of file
diff --git a/README.md b/README.md
index fb6c827..b01af78 100644
--- a/README.md
+++ b/README.md
@@ -50,39 +50,12 @@ pip install -r requirements.txt
 ```
 Next, navigate to any file and run it.
 ```bash
-# module folder
-cd project
-
 # run module (example: mnist as your main contribution)
-python lit_mnist.py
-
-# train on GPU
-python lit_mnist.py --max_epochs 100 --accelerator gpu
-
-# Multi GPU
-python lit_mnist.py --max_epochs 100 --accelerator gpu --strategy ddp --devices 2
+python project/train.py fit --config config/config.yaml
 ```
 
-## Imports
-This project is setup as a package which means you can now easily import any file into any other file like so:
-```python
-from project.datasets.mnist import mnist
-from project.lit_classifier_main import LitClassifier
-from pytorch_lightning import Trainer
-
-# model
-model = LitClassifier()
-
-# data
-train, val, test = mnist()
-
-# train
-trainer = Trainer()
-trainer.fit(model, train, val)
-
-# test using the best model!
-trainer.test(test_dataloaders=test)
-```
+## Config
+If you would like to learn more about the Lightning CLI, head over to the [LightningCLI](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli.html) docs.
 
 ### Citation
 ```
diff --git a/config/config.yaml b/config/config.yaml
new file mode 100644
index 0000000..de4ad62
--- /dev/null
+++ b/config/config.yaml
@@ -0,0 +1,31 @@
+# pytorch_lightning==2.0.1
+seed_everything: 1234
+trainer:
+  accelerator: auto
+  devices: auto
+  num_nodes: 1
+  precision: 32-true
+  logger:
+    - class_path: pytorch_lightning.loggers.WandbLogger
+      init_args:
+        name: lit_mnist_logs
+        save_dir: logs
+        project: pl_template
+  callbacks:
+    - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+      init_args:
+        verbose: True
+    - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+  fast_dev_run: false
+  max_epochs: 100
+model:
+  class_path: lit_mnist.LitClassifier
+  init_args:
+    hidden_dim: 128
+    learning_rate: 0.001
+data:
+  class_path: lit_mnist.MNISTDataModule
+  init_args:
+    data_dir: ''
+    batch_size: 16
+ckpt_path: null
diff --git a/project/train.py b/project/train.py
new file mode 100644
index 0000000..40006c5
--- /dev/null
+++ b/project/train.py
@@ -0,0 +1,17 @@
+import logging
+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.cli import LightningCLI
+
+pl_logger = logging.getLogger('pytorch_lightning')
+
+if __name__ == '__main__':
+    import datetime
+    pl_logger.info(f"Starting at {datetime.datetime.now()}")
+
+    torch.set_float32_matmul_precision('medium')
+
+    cli = LightningCLI(
+        trainer_class=pl.Trainer,
+        save_config_callback=None,
+    )
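
For reference, the entry point added in PATCH 7/7 exposes the standard LightningCLI subcommand interface, so config/config.yaml can be exercised from the shell. Below is a quick sketch of typical invocations, not part of the commits above: the fit/test subcommands, --print_config, and dotted per-field overrides are stock LightningCLI features, while the checkpoint path is a placeholder.

# print the fully resolved config without training
python project/train.py fit --config config/config.yaml --print_config

# fit, overriding individual config fields from the command line
python project/train.py fit --config config/config.yaml --trainer.max_epochs 5

# evaluate a checkpoint on the test set (placeholder path)
python project/train.py test --config config/config.yaml --ckpt_path path/to/checkpoint.ckpt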