
Commit b4f1ab4

mingxzhao authored and pytorchmergebot committed
Docs: fix docstring errors in ddp_comm_hooks (pytorch#116866)
Reopens pytorch#115272
Fixes ddp_comm_hooks errors in pytorch#112644
Pull Request resolved: pytorch#116866
Approved by: https://github.com/awgu
1 parent 16d6929 commit b4f1ab4

File tree

9 files changed, +79 -48 lines


torch/distributed/algorithms/ddp_comm_hooks/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -25,6 +25,8 @@ def _powerSGD_comm_hook_wrapper(
     start_powerSGD_iter=1_000,
 ):
     """
+    Wrap PowerSGD communication hook.
+
     To be consistent with the wrappers of other DDP comm hooks, the input state only needs to be a process group,
     which will be wrapped up with other state info.
     """
@@ -38,6 +40,8 @@ def _powerSGD_comm_hook_wrapper(

 class DDPCommHookType(Enum):
     """
+    Enumerate ``ddp_comm_hooks`` and ``ddp_comm_hook_wrapper`` communication hook types.
+
     DDPCommHookType enumerates the hooks of ``torch.distributed.algorithms.ddp_comm_hooks``
     as names and ``ddp_comm_hook_wrapper`` partials with hook specified. As an example,
     you can register allreduce hook by
@@ -89,6 +93,8 @@ def register_ddp_comm_hook(
     comm_hook_type: DDPCommHookType, model, state=None
 ):
     """
+    Register ``ddp_comm_hooks`` to DDP model.
+
     Registers the hooks of ``torch.distributed.algorithms.ddp_comm_hooks``
     to the DDP model. User can specify the type of hook as an enum
     ``DDPCommHookType`` type using ``comm_hook_type`` input. State input will
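
As the docstring notes, a hook can be registered by its enum name. A minimal sketch, assuming a DDP-wrapped ``model`` and an initialized process group ``process_group`` (both hypothetical names)::

    from torch.distributed.algorithms.ddp_comm_hooks import (
        DDPCommHookType,
        register_ddp_comm_hook,
    )

    # Register the plain allreduce hook; the state (a process group) is
    # wrapped up with other state info by the wrapper partials.
    register_ddp_comm_hook(DDPCommHookType.ALLREDUCE, model, state=process_group)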

torch/distributed/algorithms/ddp_comm_hooks/ddp_zero_hook.py

Lines changed: 20 additions & 24 deletions
@@ -24,7 +24,7 @@ def _perform_local_step(
     rank: int,
 ):
     r"""
-    Performs a local optimizer step using the gradients provided by ``bucket``.
+    Perform a local optimizer step using the gradients provided by ``bucket``.

     Arguments:
         bucket (dist.GradBucket): the bucket providing the gradients.
@@ -98,10 +98,10 @@ def _save_ddp_bucket_info(
     zero: ZeroRedundancyOptimizer,
 ):
     r"""
-    Saves :class:`DistributedDataParallel` gradient bucket information for the
-    :class:`ZeroRedundancyOptimizer` instance ``zero`` to use when overlapping.
+    Save :class:`DistributedDataParallel` gradient bucket information for :class:`ZeroRedundancyOptimizer` instance ``zero``.
+
     In particular, this function is meant to be called upon seeing each
-    gradient bucket, meaning it does not save or compute any global
+    gradient bucket to use when overlapping, meaning it does not save or compute any global
     information.

     Arguments:
@@ -130,8 +130,9 @@ def _hook_with_zero_step_setup(
     bucket: dist.GradBucket,
 ):
     r"""
-    Encapsulates the setup logic for :func:`hook_with_zero_step` and
-    :func:`hook_with_zero_step_interleaved`, meaning the logic to run in the
+    Encapsulate the setup logic for :func:`hook_with_zero_step` and :func:`hook_with_zero_step_interleaved`.
+
+    This means the logic to run in the
     hook before the backward pass and optimizer step can actually be
     overlapped. This is factored out since it is common to both
     :func:`hook_with_zero_step` and :func:`hook_with_zero_step_interleaved`.
@@ -172,16 +173,14 @@ def hook_with_zero_step(
     shard_buckets: bool = False,
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
     r"""
-    Modifies the given ``hook`` to overlap the :class:`ZeroRedundancyOptimizer`
-    optimizer step with the :class:`DistributedDataParallel` backward pass,
-    where the optimizer step computation begins after the last gradient bucket
-    computation has finished.
+    Modify ``hook`` to overlap :class:`ZeroRedundancyOptimizer` optimizer step with :class:`DistributedDataParallel` backward pass.

     This approach overlaps the optimizer computation and communication with the
     backward communication. In particular, the backward computation proceeds
     contiguously, and the optimizer computation follows, overlapping with
     outstanding backward communication (i.e. all-reduces) and possibly other
     optimizer communication (i.e. broadcasts).
+    The optimizer step computation begins after the last gradient bucket computation has finished.

     This approach may be preferred over :meth:`hook_with_zero_step_interleaved`
     if communication is relatively slow compared to computation.
@@ -244,11 +243,11 @@ def hook_with_zero_fn(
         bucket: dist.GradBucket,
     ) -> torch.futures.Future[torch.Tensor]:
         r"""
-        Returns a :class:`Future` that gives a gradient bucket tensor and
-        performs the equivalent of a :class:`ZeroRedundancyOptimizer`
-        :meth:`step` if ``bucket`` is the last gradient bucket.
+        Return a :class:`Future` that runs the optimizer step if this corresponds to the last gradient bucket.

-        The function performs additional computation on the iteration that
+        Perform the equivalent of a :class:`ZeroRedundancyOptimizer` :meth:`step` if ``bucket`` is the last gradient bucket.
+        The function gives a gradient bucket tensor and
+        performs additional computation on the iteration that
         the :class:`DistributedDataParallel` buckets are rebuilt to collect
         information used to implement the modified hook.
@@ -331,10 +330,7 @@ def hook_with_zero_step_interleaved(
     shard_buckets: bool = False,
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
     r"""
-    Modifies the given ``hook`` to overlap the :class:`ZeroRedundancyOptimizer`
-    optimizer step with the :class:`DistributedDataParallel` backward pass,
-    where the optimizer step computation interleaves with the backward
-    computation.
+    Modify ``hook`` to overlap :class:`ZeroRedundancyOptimizer` optimizer step with :class:`DistributedDataParallel` backward pass.

     This approach overlaps the optimizer computation and communication with the
     backward computation and communication. In particular, once a bucket's
@@ -404,9 +400,11 @@ def hook_with_zero_interleaved_fn(
         bucket: dist.GradBucket,
     ) -> torch.futures.Future[torch.Tensor]:
         r"""
-        Returns a :class:`Future` that gives a gradient bucket tensor and
-        performs a partial :class:`ZeroRedundancyOptimizer` :meth:`step` using
-        the gradients in that bucket.
+        Return a :class:`Future` that gives a gradient bucket tensor and performs a partial :class:`ZeroRedundancyOptimizer` :meth:`step`.
+
+        This function uses the gradients in the given bucket to perform a partial
+        :class:`ZeroRedundancyOptimizer` :meth:`step`.

         Arguments:
             state: any state for the hook.
             bucket (dist.GradBucket): the :class:`DistributedDataParallel`
@@ -419,9 +417,7 @@ def hook_with_zero_interleaved_fn(

         def zero_step(fut: torch.futures.Future) -> torch.Tensor:
             r"""
-            Performs a partial :class:`ZeroRedundancyOptimizer` :meth:`step`
-            using the gradients in the given :class:`DistributedDataParallel`
-            gradient bucket.
+            Perform a partial :class:`ZeroRedundancyOptimizer` :meth:`step` using the gradients in the given :class:`DistributedDataParallel` gradient bucket.

             Returns:
                 A :class:`torch.Tensor` representing the contents of the
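
These wrappers are applied by registering the modified hook on the DDP model. A minimal sketch, assuming a DDP-wrapped ``model``, a ``ZeroRedundancyOptimizer`` ``zero`` constructed with ``overlap_with_ddp=True``, and an initialized process group ``process_group`` (all hypothetical names)::

    from torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook import hook_with_zero_step
    from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import allreduce_hook

    # Wrap allreduce so the optimizer step runs once the last gradient
    # bucket has been communicated, then register the wrapped hook.
    model.register_comm_hook(
        process_group, hook_with_zero_step(allreduce_hook, model, zero)
    )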

torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py

Lines changed: 1 addition & 2 deletions
@@ -8,8 +8,7 @@

 def noop_hook(_: Any, bucket: GradBucket) -> torch.futures.Future[torch.Tensor]:
     """
-    This DDP communication hook returns a future that wraps the input,
-    so it is a noop that does not incur any communication overheads.
+    Return a future that wraps the input, so it is a no-op that does not incur any communication overheads.

     This hook should **only** be used for headroom analysis of allreduce optimization,
     instead of the normal gradient synchronization.
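
A minimal sketch of such a headroom measurement, assuming a DDP-wrapped ``model`` (hypothetical name)::

    from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook

    # Skip gradient synchronization entirely to measure the upper bound
    # of any allreduce optimization; the hook needs no state.
    model.register_comm_hook(None, noop_hook)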

torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py

Lines changed: 11 additions & 5 deletions
@@ -8,7 +8,7 @@
 def _allreduce_fut(
     process_group: dist.ProcessGroup, tensor: torch.Tensor
 ) -> torch.futures.Future[torch.Tensor]:
-    "Averages the input gradient tensor by allreduce and returns a future."
+    """Average the input gradient tensor by allreduce and return a future."""
     group_to_use = process_group if process_group is not None else dist.group.WORLD

     # Apply the division first to avoid overflow, especially for FP16.
@@ -25,9 +25,12 @@ def allreduce_hook(
     process_group: dist.ProcessGroup, bucket: dist.GradBucket
 ) -> torch.futures.Future[torch.Tensor]:
     """
-    This DDP communication hook just calls ``allreduce`` using ``GradBucket``
-    tensors. Once gradient tensors are aggregated across all workers, its ``then``
-    callback takes the mean and returns the result. If user registers this hook,
+    Call ``allreduce`` using ``GradBucket`` tensors.
+
+    Once gradient tensors are aggregated across all workers, its ``then``
+    callback takes the mean and returns the result.
+
+    If user registers this DDP communication hook,
     DDP results is expected to be same as the case where no hook was registered.
     Hence, this won't change behavior of DDP and user can use this as a reference
     or modify this hook to log useful information or any other purposes while
@@ -44,6 +47,8 @@ def fp16_compress_hook(
     process_group: dist.ProcessGroup, bucket: dist.GradBucket
 ) -> torch.futures.Future[torch.Tensor]:
     """
+    Compress by casting the ``GradBucket`` tensor to ``torch.float16`` and dividing by the process group size.
+
     This DDP communication hook implements a simple gradient compression
     approach that casts ``GradBucket`` tensor to half-precision floating-point format (``torch.float16``)
     and then divides it by the process group size.
@@ -113,10 +118,11 @@ def fp16_compress_wrapper(
     hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
     """
+    Cast input tensor to ``torch.float16``, cast result of hook back to input dtype.
+
     This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
     floating point format (``torch.float16``), and casts the resulting tensor of the given hook back to
     the input data type, such as ``float32``.
-
     Therefore, ``fp16_compress_hook`` is equivalent to ``fp16_compress_wrapper(allreduce_hook)``.

     Example::
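
A minimal sketch of both uses, assuming a DDP-wrapped ``model`` (hypothetical name); passing ``None`` as state falls back to the default WORLD process group::

    from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default

    # Register the fp16 compression hook directly ...
    model.register_comm_hook(None, default.fp16_compress_hook)

    # ... or wrap any other comm hook with the same fp16 cast/uncast;
    # fp16_compress_wrapper(allreduce_hook) is equivalent to the above:
    # model.register_comm_hook(None, default.fp16_compress_wrapper(default.allreduce_hook))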

torch/distributed/algorithms/ddp_comm_hooks/mixed_precision_hooks.py

Lines changed: 4 additions & 0 deletions
@@ -10,9 +10,11 @@
 class _AllreduceUpcastHookState:
     """
     State to manage DDP mixed precision in backward / gradient communication.
+
     This contains a weakref to the DDP module for access to reducer and process
     group, and a stream to run parameter and gradient upcasts.
     """
+
     ddp_weakref: Any
     upcast_stream: torch.cuda.Stream
     wait_for_stream_enqueued: bool = False
@@ -22,6 +24,8 @@ def _reducer_allreduce_and_upcast_hook(
     hook_state: _AllreduceUpcastHookState, bucket: dist.GradBucket
 ) -> torch.futures.Future[torch.Tensor]:
     """
+    Perform allreduce in the reduced precision ``reduce_dtype``, then upcast to prepare for the optimizer.
+
     Performs allreduce in the reduced precision given by DDP's mixed precision
     reduce_dtype, and upcasts parameters and gradients to fp32 in preparation
     to run the optimizer.

torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py

Lines changed: 4 additions & 3 deletions
@@ -13,6 +13,7 @@
 class _OptimizerHookState:
     """
     Holds state for running optimizer in-line after DDP communication hook.
+
     Currently contains only optimizer class which must have a method `step_param`.
     """

@@ -45,6 +46,8 @@ def _apply_optim_in_backward_hook(
     gradient_is_bucket_view: bool
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
     r"""
+    Register hook to apply the optimizer in backward.
+
     If torch.distributed.optim._apply_optimizer_in_backward is used to overlap
     optimizer with backward pass, DDP will run the below hook to run optimizer
     step for parameters after gradient communication has taken place.
@@ -123,9 +126,7 @@ def _hook_then_optimizer(
     hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]],
     optimizer_state: _OptimizerHookState,
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
-    r"""
-    Runs optimizer in a functional fashion after DDP communication hook.
-    """
+    r"""Run optimizer in a functional fashion after DDP communication hook."""
     has_set_params = (
         hasattr(optimizer_state, 'params_to_optimize')
         and optimizer_state.params_to_optimize is not None

torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py

Lines changed: 6 additions & 1 deletion
@@ -10,6 +10,8 @@

 class PostLocalSGDState:
     r"""
+    Store state for all-reducing gradients globally until given step, then locally after.
+
     Stores the state for all-reducing gradients globally using ``process_group`` until step ``start_localSGD_iter``,
     and all-reducing gradients locally using ``subgroup`` afterwards.
@@ -35,6 +37,7 @@ def __init__(
         start_localSGD_iter,
         post_local_gradient_allreduce=True,
     ):
+        """Initialize state object with given parameters and log when local SGD starts."""
        logger.info(
            "Local SGD will be started after %s iterations", start_localSGD_iter
        )
@@ -51,6 +54,7 @@
         self.iter = 0

     def maybe_increase_iter(self, bucket):
+        """Track iterations and trigger log message at start of local SGD."""
         # Since bucket 0 is the last bucket to allreduce in an iteration.
         # Only increase `iter` when bucket 0 is processed.
         if bucket.is_last():
@@ -61,11 +65,12 @@ def maybe_increase_iter(self, bucket):
                 "Start to apply local SGD after %s iterations.", self.iter
             )

-
 def post_localSGD_hook(
     state: PostLocalSGDState, bucket: dist.GradBucket
 ) -> torch.futures.Future[torch.Tensor]:
     """
+    Run post-localSGD algorithm.
+
     This DDP communication hook is used for running post-localSGD algorithm,
     by combining with a model averaging component (e.g.,
     :class:`~torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager`)
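
A minimal sketch of the post-localSGD setup, assuming a DDP-wrapped ``model`` (hypothetical name) and an already-initialized default process group::

    import torch.distributed as dist
    from torch.distributed.algorithms.ddp_comm_hooks import post_localSGD_hook as post_localSGD

    # Intra-machine subgroups by default; gradients are all-reduced globally
    # for the first 100 iterations, then only within the subgroup.
    subgroup, _ = dist.new_subgroups()
    state = post_localSGD.PostLocalSGDState(
        process_group=None, subgroup=subgroup, start_localSGD_iter=100
    )
    model.register_comm_hook(state, post_localSGD.post_localSGD_hook)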

torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py

Lines changed: 21 additions & 9 deletions
@@ -19,6 +19,7 @@
 def _orthogonalize(matrices, epsilon=0):
     """
     Decide between Gram-Schmidt or QR factorization to orthogonalize a batch of matrices.
+
     QR factorization doesn't work with half-precision, but it is usually faster with a rank > 2.
     """
     assert len(matrices.shape) == 3 and matrices.shape[2] <= matrices.shape[1]
@@ -39,7 +40,8 @@ def _orthogonalize(matrices, epsilon=0):

 def _orthogonalize_gram_schmidt(matrices, epsilon=0):
     """
-    Applies Gram-Schmidt procedure to orthogonalize a batch of matrices.
+    Apply Gram-Schmidt procedure to orthogonalize a batch of matrices.
+
     If epsilon is 0, this is equivalent to `torch.qr(matrices, out=(matrices, _))`,
     """
     num_cols = matrices.shape[2]
@@ -73,6 +75,8 @@ def _should_compress(
     num_rows, num_cols, matrix_approximation_rank, min_compression_rate
 ):
     """
+    Recommend whether the given tensor is worth compressing.
+
     Returns a recommendation as to whether the 2D tensor described by the arguments is worth compressing,
     including statistics describing the expected savings from compression. We consider a tensor worth
     compressing when ``min_compression_rate`` < uncompressed size / compressed size, where
@@ -97,9 +101,7 @@


 def _report_compression_stats(bucket, state):
-    """
-    Report compression stats at the frequency of `compression_stats_logging_frequency` specified in PowerSGD state.
-    """
+    """Report compression stats at frequency of ``compression_stats_logging_frequency`` specified in PowerSGD state."""
     if (
         bucket.is_last()
         and state.iter >= state.next_stats_report
@@ -114,7 +116,8 @@ def _report_compression_stats(bucket, state):

 class PowerSGDState:
     r"""
-    Stores both the algorithm's hyperparameters and the internal state for all the gradients during the training.
+    Store both the algorithm's hyperparameters and internal state for all gradients during training.
+
     Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main hyperparameters that should be tuned by the user.
     For performance, we suggest to keep binary hyperparameters ``use_error_feedback`` and ``warm_start`` on.
@@ -266,7 +269,8 @@ def __init__(

     def __getstate__(self):
         r"""
-        Returns a ``Dict[str, Any]`` which will be pickled and saved.
+        Return a ``Dict[str, Any]`` which will be pickled and saved.
+
         ``process_group`` is not serializable and excluded from
         a returned state.
         """
@@ -280,7 +284,8 @@ def __getstate__(self):

     def __setstate__(self, state):
         r"""
-        Takes a provided ``state`` and retrieves ``PowerSGDState``.
+        Take a provided ``state`` and set it on this ``PowerSGDState`` instance.
+
         ``process_group`` is set to default.
         """
         self.process_group = distributed_c10d._get_default_group()
@@ -292,6 +297,7 @@ def __setstate__(self, state):
             setattr(self, slot, value)

     def maybe_increase_iter(self, bucket):
+        """Track iterations and trigger log message at start of PowerSGD."""
         # Since bucket 0 is the last bucket to allreduce in an iteration.
         # Only increase `iter` when bucket 0 is processed.
         if bucket.is_last():
@@ -304,7 +310,9 @@ def maybe_increase_iter(self, bucket):

     def compression_stats(self):
         r"""
-        Returns the latest compression statistics as a tuple of the form (compress_rate, numel_before_compression, numel_after_compression), where:
+        Return the latest compression statistics as a tuple.
+
+        Returns a tuple of the form (compress_rate, numel_before_compression, numel_after_compression), where:

         compress_rate is the effective compression rate i.e. (number of elements before compression) / (number of elements after compression);
@@ -328,6 +336,8 @@ def powerSGD_hook(
     state: PowerSGDState, bucket: dist.GradBucket
 ) -> torch.futures.Future[torch.Tensor]:
     r"""
+    Implement PowerSGD algorithm.
+
     This DDP communication hook implements PowerSGD gradient compression
     algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
     Once gradient tensors are aggregated across all workers, this hook applies
@@ -636,6 +646,8 @@ def batched_powerSGD_hook(
     state: PowerSGDState, bucket: dist.GradBucket
 ) -> torch.futures.Future[torch.Tensor]:
     r"""
+    Implement simplified PowerSGD algorithm.
+
     This DDP communication hook implements a simplified PowerSGD gradient compression
     algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
     This variant does not compress the gradients layer by layer,
@@ -750,7 +762,7 @@ def batched_powerSGD_hook(
     )

     def create_low_rank_tensor(fill_random_values, rng):
-        "Returns a low-rank 2D tensor of square_side_length * matrix_approximation_rank."
+        """Return a low-rank 2D tensor of square_side_length * matrix_approximation_rank."""
         if fill_random_values:
             with torch.random.fork_rng(devices=[]):
                 # Fork this RNG to avoid changing the seed globally and affecting the random sampling
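
A minimal sketch of enabling PowerSGD compression, assuming a DDP-wrapped ``model`` (hypothetical name) and an initialized default process group::

    from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

    state = powerSGD.PowerSGDState(
        process_group=None,           # default (global) process group
        matrix_approximation_rank=1,  # main accuracy/compression trade-off knob
        start_powerSGD_iter=1_000,    # vanilla allreduce during warm-up
    )
    model.register_comm_hook(state, powerSGD.powerSGD_hook)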
