Commit: Create package
kopalja committed Jan 17, 2025
1 parent f111e2a commit 7a993bc
Showing 25 changed files with 134 additions and 116 deletions.
2 changes: 1 addition & 1 deletion main.py
@@ -6,7 +6,7 @@

from misc import (get_model_size, init_dataset, init_model, supported_datasets,
                  supported_models)
-from optimizers import optimizers_map
+from misc import optimizers_map
from train import OvershootTrainer
from trainer_configs import get_trainer_config

109 changes: 108 additions & 1 deletion misc.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Optional

import numpy as np
from peft import LoraConfig, TaskType, get_peft_model
@@ -18,6 +18,15 @@
from models.vae import VAE
from trainer_configs import *

+from overshoot.sgd_overshoot import SGDO
+from overshoot.adamw_overshoot_delayed import AdamO as OvershootAdamW_delayed
+
+from optimizers_old.backups2.sgdo_adaptive import SGDO as SGDO_adaptive
+from optimizers_old.backups2.adamw_overshoot_replication import AdamW as OvershootAdamW_replication
+from optimizers_old.backups2.adamw_overshoot_full_approximation import AdamW as OvershootAdamW_full_approximation
+from optimizers_old.backups2.adamw_overshoot_denom_approximation import AdamW as OvershootAdamW_denom_approximation
+from optimizers_old.backups2.adamw_overshoot_adaptive import AdamW as OvershootAdamW_adaptive

supported_datasets = [
    "mnist",
    "f-mnist",
@@ -41,6 +50,25 @@
"minilm",
]

+optimizers_map = {
+    "sgd": torch.optim.SGD,
+    "sgd_momentum": torch.optim.SGD,
+    "sgd_nesterov": torch.optim.SGD,
+    "sgd_overshoot": SGDO,
+    "sgd_adaptive": SGDO_adaptive,
+    "adam": torch.optim.Adam,
+    "adamW": torch.optim.AdamW,
+    "adam_zero": torch.optim.Adam,
+    "adamW_zero": torch.optim.AdamW,
+    "nadam": torch.optim.NAdam,
+    "adamW_overshoot_replication": OvershootAdamW_replication,
+    "adamW_overshoot_full_approximation": OvershootAdamW_full_approximation,
+    "adamW_overshoot_denom_approximation": OvershootAdamW_denom_approximation,
+    "adamW_overshoot_delayed": OvershootAdamW_delayed,
+    "adamW_overshoot_adaptive": OvershootAdamW_adaptive,
+    "rmsprop": torch.optim.RMSprop,
+}


def init_dataset(dataset_name: str, model_name: Optional[str], seed: Optional[int] = None):
    if dataset_name == "mnist":
@@ -196,3 +224,82 @@ def get_model_size(model: torch.nn.Module):
    buffer_size = sum(p.numel() for p in model.buffers()) * 4
    size_all_mb = (param_size + buffer_size) / 1024 / 1024
    return round(size_all_mb, 2)

+def create_optimizer(opt_name: str, param_groups, overshoot_factor: float, lr: float, config, foreach: Optional[bool] = None) -> torch.optim.Optimizer:
+    if opt_name == "nadam":
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            betas=(config.adam_beta1, config.adam_beta2),
+            momentum_decay=1000000000000000000000000,  # Huge constant to effectively turn off momentum decay
+            weight_decay=config.weight_decay,
+            decoupled_weight_decay=True,
+            foreach=foreach,
+        )
+    elif opt_name == "adamW_overshoot_delayed":
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            betas=(config.adam_beta1, config.adam_beta2),
+            weight_decay=config.weight_decay,
+            overshoot=overshoot_factor,
+            overshoot_delay=config.overshoot_delay,
+            foreach=foreach,
+        )
+    elif opt_name == "adamW_overshoot_adaptive":
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            betas=(config.adam_beta1, config.adam_beta2),
+            weight_decay=config.weight_decay,
+            cosine_target=config.target_cosine_similarity,
+            foreach=foreach,
+        )
+    elif opt_name.startswith("adamW_overshoot"):
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            betas=(config.adam_beta1, config.adam_beta2),
+            weight_decay=config.weight_decay,
+            overshoot=overshoot_factor,
+            foreach=foreach,
+        )
+    elif "adam" in opt_name:
+        # Zero out beta1 for the "*_zero" variants; multiplying by the
+        # boolean leaves beta1 unchanged for all other adam variants.
+        config.adam_beta1 *= "zero" not in opt_name
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            betas=(config.adam_beta1, config.adam_beta2),
+            weight_decay=config.weight_decay,
+            foreach=foreach,
+        )
+    elif "sgd_adaptive" in opt_name:
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            momentum=config.sgd_momentum,
+            weight_decay=config.weight_decay,
+            cosine_target=config.target_cosine_similarity,
+            foreach=foreach,
+        )
+    elif "sgd_overshoot" in opt_name:
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            momentum=config.sgd_momentum,
+            weight_decay=config.weight_decay,
+            overshoot=overshoot_factor,
+            foreach=foreach,
+        )
+    elif "sgd" in opt_name:
+        opt = optimizers_map[opt_name](
+            param_groups,
+            lr=lr,
+            momentum=0 if opt_name == "sgd" else config.sgd_momentum,
+            weight_decay=config.weight_decay,
+            nesterov="nesterov" in opt_name,
+            foreach=foreach,
+        )
+    else:
+        raise Exception(f"Optimizer {opt_name} not recognized.")
+    return opt
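
For context, a minimal sketch (not part of the commit) of how the helpers now living in misc.py would be wired into a training step. The model, config values, and overshoot factor below are hypothetical stand-ins, and SGDO is assumed to keep the torch.optim.SGD keyword interface it forks:

import torch
from misc import create_optimizer

class Cfg:
    # Hypothetical config exposing the fields create_optimizer reads.
    sgd_momentum = 0.9
    weight_decay = 0.01
    adam_beta1, adam_beta2 = 0.9, 0.999

model = torch.nn.Linear(10, 2)
opt = create_optimizer("sgd_overshoot", model.parameters(),
                       overshoot_factor=3.0, lr=1e-3, config=Cfg())

x, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
torch.nn.functional.cross_entropy(model(x), y).backward()
opt.step()
opt.zero_grad()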
111 changes: 0 additions & 111 deletions optimizers/__init__.py

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file added overshoot/__init__.py
File renamed without changes.
1 change: 0 additions & 1 deletion optimizers/sgd_overshoot.py → overshoot/sgd_overshoot.py
@@ -442,7 +442,6 @@ def _fused_sgd(
        momentum=momentum,
        lr=lr,
        dampening=dampening,
-        nesterov=nesterov,
        maximize=maximize,
        is_first_step=is_first_step,
        grad_scale=device_grad_scale,
23 changes: 23 additions & 0 deletions setup.py
@@ -0,0 +1,23 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="overshoot",
+    version="0.1.0",
+    description="Overshoot version of SGD and AdamW optimizers",
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
+    author="Jakub Kopal",
+    author_email="[email protected]",
+    url="https://github.com/kinit-sk/overshoot",
+    license="MIT",
+    packages=find_packages(),
+    install_requires=[
+        "torch>=2.4.0",
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.9",
+)
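
With this setup.py in place, the renamed overshoot/ directory becomes an installable package (e.g. running "pip install ." from the repo root), after which the optimizers import under the new top-level name, matching the paths used in misc.py above:

from overshoot.sgd_overshoot import SGDO
from overshoot.adamw_overshoot_delayed import AdamO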
2 changes: 1 addition & 1 deletion train.py
@@ -14,7 +14,7 @@
from trainer_configs import DefaultConfig
from custom_datasets import UnifiedDatasetInterface
from misc import compute_model_distance, get_gpu_stats
-from optimizers import create_optimizer
+from misc import create_optimizer

# ------------------------------------------------------------------------------
torch.cuda.empty_cache()
2 changes: 1 addition & 1 deletion train_with_pl.py
@@ -13,7 +13,7 @@
from torch.nn import functional as F
from torch.utils.data import DataLoader

-from optimizers import optimizers_map
+from optimizers.overshoot import optimizers_map


from misc import init_dataset, init_model, get_gpu_stats, compute_model_distance, supported_datasets, supported_models
