
Commit 159f03b

Introducing a generic ModelHandler interface.
This interface should cover most use cases, such as quantization, fused layer optimization, ...
1 parent 690f299 commit 159f03b

File tree

10 files changed: +127 -26 lines


docs/float8.md

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ Launch training job with the following command (or alternatively set configs in
 ```
 CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp
 ```
-* `--float8.enable_float8_linear`: swap `nn.Linear` with `Float8Linear` to perform float8 matmul.
+<!-- * `--float8.enable_float8_linear`: swap `nn.Linear` with `Float8Linear` to perform float8 matmul. -->
 * `--float8.enable_fsdp_float8_all_gather`: cast `Float8Linear.weight` from high precision to float8 before FSDP all-gather so we can communicate in float8 to save bandwidth.
 * `--float8.precompute_float8_dynamic_scale_for_fsdp` (optional): communicate AMAX/scales efficiently in a single all-reduce for all parameters instead of doing many small all-reduce for each parameter.

torchtitan/config_manager.py

Lines changed: 15 additions & 9 deletions

@@ -182,6 +182,12 @@ def __init__(self):
             default="./torchtitan/datasets/tokenizer/tokenizer.model",
             help="Tokenizer path",
         )
+        self.parser.add_argument(
+            "--model.handlers",
+            type=str,
+            default="",
+            help="Comma separated list of handlers to apply to the model (e.g. 'float8')",
+        )
 
         # optimizer configs
         self.parser.add_argument(
@@ -529,15 +535,15 @@ def __init__(self):
         )
 
         # float8 configs
-        self.parser.add_argument(
-            "--float8.enable_float8_linear",
-            action="store_true",
-            help="""
-            If true, swaps `torch.nn.Linear` with `Float8Linear`.
-            This feature requires you to install 'torchao' which can be found
-            here: https://github.com/pytorch/ao
-            """,
-        )
+        # self.parser.add_argument(
+        #     "--float8.enable_float8_linear",
+        #     action="store_true",
+        #     help="""
+        #     If true, swaps `torch.nn.Linear` with `Float8Linear`.
+        #     This feature requires you to install 'torchao' which can be found
+        #     here: https://github.com/pytorch/ao
+        #     """,
+        # )
         self.parser.add_argument(
             "--float8.enable_fsdp_float8_all_gather",
             action="store_true",

torchtitan/float8.py

Lines changed: 11 additions & 3 deletions

@@ -20,6 +20,7 @@
 
 from torchtitan.config_manager import JobConfig
 from torchtitan.logging import logger
+from torchtitan.model_handler import ModelHandler, register_model_handler
 from torchtitan.parallelisms import ParallelDims
 
 
@@ -28,13 +29,11 @@ def _is_sm89_or_later():
     return torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 9)
 
 
-class Float8Handler:
+class Float8Handler(ModelHandler):
     def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
         self.enabled = False
 
         float8_config = job_config.float8
-        if not float8_config.enable_float8_linear:
-            return
         if not _is_sm89_or_later():
             logger.warning(
                 "Failed to swap to Float8Linear because float8 is only supported on SM89 or later",
@@ -66,6 +65,12 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
 
         logger.info("Float8 training active")
 
+    def convert(self, model: nn.Module):
+        return self.convert_to_float8_training(model)
+
+    def post_optimizer_hook(self, model: Union[nn.Module, List[nn.Module]]):
+        return self.precompute_float8_dynamic_scale_for_fsdp(model)
+
     def convert_to_float8_training(self, model: nn.Module):
         """
         This function converts the linear layers of `model` to `Float8Linear`.
@@ -102,3 +107,6 @@ def precompute_float8_dynamic_scale_for_fsdp(
         models = [model] if isinstance(model, nn.Module) else model
         for m in models:
             precompute_float8_dynamic_scale_for_fsdp(m)
+
+
+register_model_handler(Float8Handler, "float8")

torchtitan/model_handler.py

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict, List, Protocol, Union
+
+import torch.nn as nn
+
+from torchtitan.config_manager import JobConfig
+from torchtitan.parallelisms import ParallelDims
+
+
+class ModelHandler(Protocol):
+    """General model handler interface.
+
+    A model handler applies a modification to a PyTorch model.
+    Typical use cases are:
+    - Quantization: using QAT, FP8, ... specialized linear layers;
+    - Fused optimized layers (e.g. flash-attention, norms, ...)
+    """
+
+    def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
+        ...
+
+    def convert(self, model: nn.Module):
+        """In-place conversion of the model."""
+        ...
+
+    def post_optimizer_hook(self, model: Union[nn.Module, List[nn.Module]]):
+        """Optional post-optimizer hook (e.g. compute weight statistics)."""
+        ...
+
+
+_registry_model_handler_cls: Dict[str, type[ModelHandler]] = {}
+"""Registry of model handler classes.
+"""
+
+
+def register_model_handler(handler_cls: type[ModelHandler], name: str):
+    """Register a model handler class.
+
+    A registered model handler can be applied to any TorchTitan model
+    using the `model.handlers` config parameter.
+    """
+    assert (
+        name not in _registry_model_handler_cls
+    ), f"A TorchTitan model handler '{name}' is already registered."
+    _registry_model_handler_cls[name] = handler_cls
+
+
+class ModelHandlersContainer(ModelHandler):
+    """Sequential container of model handlers.
+
+    This class builds the sequence of model handlers defined in the
+    `model.handlers` job config and applies them to the model sequentially.
+    """
+
+    def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
+        handler_names = parse_model_handlers(job_config)
+        handler_classes = [_registry_model_handler_cls[name] for name in handler_names]
+        self.handlers = [
+            mh_cls(job_config, parallel_dims) for mh_cls in handler_classes
+        ]
+
+    def convert(self, model: nn.Module):
+        for mh in self.handlers:
+            mh.convert(model)
+
+    def post_optimizer_hook(self, model: Union[nn.Module, List[nn.Module]]):
+        for mh in self.handlers:
+            mh.post_optimizer_hook(model)
+
+
+def parse_model_handlers(job_config: JobConfig) -> List[str]:
+    """Parse the list of model handlers to apply."""
+    handler_names = [v.strip() for v in job_config.model.handlers.split(",")]
+    handler_names = [v for v in handler_names if len(v) > 0]
+    return handler_names
+
+
+def build_model_handlers_container(
+    job_config: JobConfig, parallel_dims: ParallelDims
+) -> ModelHandlersContainer:
+    """Build the collection of model handlers to apply to the model."""
+    return ModelHandlersContainer(job_config, parallel_dims)
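
To illustrate the extension point, here is a hedged sketch of a hypothetical custom handler wired through the new registry and container. `LayerFreezeHandler` and the `layer_freeze` name are invented for this example, and a `SimpleNamespace` stands in for the real `JobConfig`/`ParallelDims`; only the interfaces added in this commit are assumed.

```
from types import SimpleNamespace
from typing import List, Union

import torch.nn as nn

from torchtitan.model_handler import (
    ModelHandler,
    ModelHandlersContainer,
    register_model_handler,
)


class LayerFreezeHandler(ModelHandler):
    """Hypothetical handler: freezes embedding parameters after model creation."""

    def __init__(self, job_config, parallel_dims):
        self.enabled = True

    def convert(self, model: nn.Module):
        # In-place modification of the model, as the protocol expects.
        for name, param in model.named_parameters():
            if "embed" in name:
                param.requires_grad_(False)

    def post_optimizer_hook(self, model: Union[nn.Module, List[nn.Module]]):
        # Nothing to do after the optimizer step for this handler.
        pass


# Registration happens at import time of the defining module, just as
# float8.py does with register_model_handler(Float8Handler, "float8").
register_model_handler(LayerFreezeHandler, "layer_freeze")

# Stand-in config: the container only reads `job_config.model.handlers`.
job_config = SimpleNamespace(model=SimpleNamespace(handlers="layer_freeze"))
handlers = ModelHandlersContainer(job_config, parallel_dims=None)

model = nn.Sequential()
model.add_module("tok_embeddings", nn.Embedding(16, 8))
handlers.convert(model)  # applies every listed handler in order
print(model.tok_embeddings.weight.requires_grad)  # False
```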

torchtitan/parallelisms/parallelize_llama.py

Lines changed: 3 additions & 2 deletions

@@ -34,7 +34,7 @@
 from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.logging import logger
 from torchtitan.parallelisms.parallel_dims import ParallelDims
-
+from torchtitan.model_handler import parse_model_handlers
 
 def parallelize_llama(
     model: nn.Module,
@@ -56,11 +56,12 @@ def parallelize_llama(
             and not job_config.training.compile
         ):
             raise RuntimeError("Async TP requires --training.compile")
+        enable_float8 = "float8" in parse_model_handlers(job_config)
         apply_tp(
             model,
             world_mesh["tp"],
             loss_parallel=parallel_dims.loss_parallel_enabled,
-            enable_float8=job_config.float8.enable_float8_linear,
+            enable_float8=enable_float8,
             enable_async_tp=job_config.experimental.enable_async_tensor_parallel,
         )
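
With the `enable_float8_linear` flag gone, whether float8 is active for the TP plan is derived from the handler list. A hedged sketch of that gating, with a `SimpleNamespace` standing in for the real `JobConfig`:

```
from types import SimpleNamespace

from torchtitan.model_handler import parse_model_handlers

# Stand-in for JobConfig; parse_model_handlers only reads `model.handlers`.
job_config = SimpleNamespace(model=SimpleNamespace(handlers="float8"))

enable_float8 = "float8" in parse_model_handlers(job_config)
print(enable_float8)  # True; False whenever "float8" is not listed
```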

train.py

Lines changed: 7 additions & 7 deletions

@@ -16,9 +16,9 @@
 from torchtitan.checkpoint import CheckpointManager, TrainState
 from torchtitan.config_manager import JobConfig
 from torchtitan.datasets import build_hf_data_loader, build_tokenizer
-from torchtitan.float8 import Float8Handler
 from torchtitan.logging import init_logger, logger
 from torchtitan.metrics import build_device_memory_monitor, build_metric_logger
+from torchtitan.model_handler import build_model_handlers_container
 from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
 from torchtitan.optimizer import build_lr_schedulers, build_optimizers
 from torchtitan.parallelisms import (
@@ -110,10 +110,9 @@ def main(job_config: JobConfig):
     with torch.device("meta"):
         model = model_cls.from_model_args(model_config)
 
-    # a no-op handler if float8 is not enabled
-    float8_handler = Float8Handler(job_config, parallel_dims)
-    # swap to Float8Linear based on float8 configs
-    float8_handler.convert_to_float8_training(model)
+    # Build the collection of model handlers. No-op if `model.handlers` is empty.
+    model_handlers = build_model_handlers_container(job_config, parallel_dims)
+    model_handlers.convert(model)
 
     # log model size
     model_param_count = utils.get_num_params(model)
@@ -326,9 +325,10 @@ def loss_fn(pred, labels):
            optimizers.step()
            lr_schedulers.step()

-           # calculate float8 dynamic amax/scale for all-parameter for FSDP2
+           # Post-optimizer model handlers hook,
+           # e.g. calculate float8 dynamic amax/scale for all parameters for FSDP2;
            # it issues a single all-reduce for all parameters at once for better performance
-           float8_handler.precompute_float8_dynamic_scale_for_fsdp(model_parts)
+           model_handlers.post_optimizer_hook(model_parts)

            # log metrics
            if (
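
The container is a cheap no-op when `model.handlers` is empty, which is what lets the new `convert` and `post_optimizer_hook` calls in train.py stay unconditional. A hedged sketch (stand-in `SimpleNamespace` config; `parallel_dims` is unused here):

```
from types import SimpleNamespace

import torch.nn as nn

from torchtitan.model_handler import build_model_handlers_container

# Empty handler list -> the container holds no handlers.
job_config = SimpleNamespace(model=SimpleNamespace(handlers=""))
model_handlers = build_model_handlers_container(job_config, parallel_dims=None)
print(len(model_handlers.handlers))  # 0

model = nn.Linear(8, 8)
model_handlers.convert(model)                # iterates over an empty list: no-op
model_handlers.post_optimizer_hook([model])  # accepts a module or a list of modules
```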

train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion

@@ -26,6 +26,7 @@ flavor = "debugmodel"
 norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
 # test tokenizer.model, for debug purpose only
 tokenizer_path = "./tests/assets/test_tiktoken.model"
+handlers = ""
 
 [optimizer]
 name = "AdamW"
@@ -62,4 +63,3 @@ mode = 'selective' # ['none', 'selective', 'full']
 selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy
 
 [float8]
-enable_float8_linear = false

train_configs/llama3_405b.toml

Lines changed: 1 addition & 1 deletion

@@ -20,6 +20,7 @@ name = "llama3"
 flavor = "405B"
 norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
 tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
+handlers = "float8"
 
 [optimizer]
 name = "AdamW"
@@ -55,6 +56,5 @@ async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
 mode = 'full' # ['none', 'selective', 'full']
 
 [float8]
-enable_float8_linear = true
 enable_fsdp_float8_all_gather = true
 precompute_float8_dynamic_scale_for_fsdp = true
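
In the TOML configs, the new `handlers` key sits under the existing `[model]` table. A small sketch of how that value reads back, using `tomllib` (Python 3.11+) purely for illustration; the values mirror the llama3_405b config above:

```
import tomllib  # Python 3.11+; used here only to illustrate the config shape

config_text = """
[model]
name = "llama3"
flavor = "405B"
handlers = "float8"
"""

config = tomllib.loads(config_text)
print(config["model"]["handlers"])  # "float8"
```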

train_configs/llama3_70b.toml

Lines changed: 1 addition & 1 deletion

@@ -20,6 +20,7 @@ name = "llama3"
 flavor = "70B"
 norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
 tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
+handlers = ""
 
 [optimizer]
 name = "AdamW"
@@ -54,4 +55,3 @@ async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
 mode = 'full'
 
 [float8]
-enable_float8_linear = false

train_configs/llama3_8b.toml

Lines changed: 1 addition & 1 deletion

@@ -20,6 +20,7 @@ name = "llama3"
 flavor = "8B"
 norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
 tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
+handlers = ""
 
 [optimizer]
 name = "AdamW"
@@ -55,4 +56,3 @@ mode = 'selective' # ['none', 'selective', 'full']
 selective_ac_option = 'op' # 'int' = ac every positive int layer or 'op', ac based on ops policy
 
 [float8]
-enable_float8_linear = false
