
Commit e81878e

ZhuBaohe authored and facebook-github-bot committed
Correct padding and activations docstrings in nn module
Summary: Pull Request resolved: pytorch#17197

Differential Revision: D14131284

Pulled By: soumith

fbshipit-source-id: 6edd225b47b1dde81b5ad0a23c588c6621987a69
1 parent f2f4030 commit e81878e

File tree

4 files changed: +49 −41 lines changed

torch/nn/functional.py

Lines changed: 12 additions & 13 deletions
@@ -953,16 +953,16 @@ def glu(input, dim=-1):
     The gated linear unit. Computes:

     .. math ::
+        \text{GLU}(a, b) = a \otimes \sigma(b)

-        H = A \times \sigma(B)
-
-    where `input` is split in half along `dim` to form `A` and `B`.
+    where `input` is split in half along `dim` to form `a` and `b`, :math:`\sigma`
+    is the sigmoid function and :math:`\otimes` is the element-wise product between matrices.

     See `Language Modeling with Gated Convolutional Networks <https://arxiv.org/abs/1612.08083>`_.

     Args:
         input (Tensor): input tensor
-        dim (int): dimension on which to split the input
+        dim (int): dimension on which to split the input. Default: -1
     """
     if input.dim() == 0:
         raise RuntimeError("glu does not suppport scalars because halving size must be even")
@@ -1139,7 +1139,7 @@ def rrelu(input, lower=1. / 8, upper=1. / 3, training=False, inplace=False):
 logsigmoid = _add_docstr(torch._C._nn.log_sigmoid, r"""
 logsigmoid(input) -> Tensor

-Applies element-wise :math:`\text{LogSigmoid}(x) = \log \left(\frac{1}{1 + \exp(-x_i)}\right)`
+Applies element-wise :math:`\text{LogSigmoid}(x_i) = \log \left(\frac{1}{1 + \exp(-x_i)}\right)`

 See :class:`~torch.nn.LogSigmoid` for more details.
 """)
@@ -1211,8 +1211,8 @@ def softmin(input, dim=None, _stacklevel=3, dtype=None):
         dim (int): A dimension along which softmin will be computed (so every slice
             along dim will sum to 1).
         dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
-        If specified, the input tensor is casted to :attr:`dtype` before the operation
-        is performed. This is useful for preventing data type overflows. Default: None.
+          If specified, the input tensor is casted to :attr:`dtype` before the operation
+          is performed. This is useful for preventing data type overflows. Default: None.
     """
     if dim is None:
         dim = _get_softmax_dim('softmin', input.dim(), _stacklevel)
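To make the documented dim behavior concrete, a minimal sketch of softmin (illustrative, not from the commit):

>>> import torch
>>> import torch.nn.functional as F
>>> x = torch.randn(2, 3)
>>> out = F.softmin(x, dim=1)
>>> torch.allclose(out.sum(dim=1), torch.ones(2))   # every slice along dim sums to 1
True
>>> torch.allclose(out, F.softmax(-x, dim=1))       # softmin(x) == softmax(-x)
True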
@@ -1233,17 +1233,16 @@ def softmax(input, dim=None, _stacklevel=3, dtype=None):
     :math:`\text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)}`

     It is applied to all slices along dim, and will re-scale them so that the elements
-    lie in the range `(0, 1)` and sum to 1.
+    lie in the range `[0, 1]` and sum to 1.

     See :class:`~torch.nn.Softmax` for more details.

     Arguments:
         input (Tensor): input
         dim (int): A dimension along which softmax will be computed.
         dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
-        If specified, the input tensor is casted to :attr:`dtype` before the operation
-        is performed. This is useful for preventing data type overflows. Default: None.
-
+          If specified, the input tensor is casted to :attr:`dtype` before the operation
+          is performed. This is useful for preventing data type overflows. Default: None.

     .. note::
         This function doesn't work directly with NLLLoss,
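A brief sketch of the documented :attr:`dtype` behavior (illustrative only, not part of the diff; the float16 input is just to make the cast visible):

>>> import torch
>>> import torch.nn.functional as F
>>> x = torch.randn(2, 3, dtype=torch.float16)
>>> out = F.softmax(x, dim=1, dtype=torch.float32)  # input is cast to float32 before the op
>>> out.dtype
torch.float32
>>> torch.allclose(out.sum(dim=1), torch.ones(2))   # elements lie in [0, 1] and sum to 1
True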
@@ -1335,8 +1334,8 @@ def log_softmax(input, dim=None, _stacklevel=3, dtype=None):
         input (Tensor): input
         dim (int): A dimension along which log_softmax will be computed.
         dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
-        If specified, the input tensor is casted to :attr:`dtype` before the operation
-        is performed. This is useful for preventing data type overflows. Default: None.
+          If specified, the input tensor is casted to :attr:`dtype` before the operation
+          is performed. This is useful for preventing data type overflows. Default: None.
     """
     if dim is None:
         dim = _get_softmax_dim('log_softmax', input.dim(), _stacklevel)
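Likewise for log_softmax, a minimal sketch (illustrative, not part of the commit):

>>> import torch
>>> import torch.nn.functional as F
>>> x = torch.randn(2, 3)
>>> torch.allclose(F.log_softmax(x, dim=1), torch.log(F.softmax(x, dim=1)))
True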

torch/nn/modules/activation.py

Lines changed: 16 additions & 15 deletions
@@ -9,7 +9,7 @@

 @weak_module
 class Threshold(Module):
-    r"""Thresholds each element of the input Tensor
+    r"""Thresholds each element of the input Tensor.

     Threshold is defined as:

@@ -58,10 +58,9 @@ def extra_repr(self):

 @weak_module
 class ReLU(Threshold):
-    r"""Applies the rectified linear unit function element-wise
-    :math:`\text{ReLU}(x)= \max(0, x)`
+    r"""Applies the rectified linear unit function element-wise:

-    .. image:: scripts/activation_images/ReLU.png
+    :math:`\text{ReLU}(x)= \max(0, x)`

     Args:
         inplace: can optionally do the operation in-place. Default: ``False``
@@ -71,6 +70,8 @@ class ReLU(Threshold):
           dimensions
         - Output: :math:`(N, *)`, same shape as the input

+    .. image:: scripts/activation_images/ReLU.png
+
     Examples::

         >>> m = nn.ReLU()
@@ -166,8 +167,6 @@ class Hardtanh(Module):
     The range of the linear region :math:`[-1, 1]` can be adjusted using
     :attr:`min_val` and :attr:`max_val`.

-    .. image:: scripts/activation_images/Hardtanh.png
-
     Args:
         min_val: minimum value of the linear region range. Default: -1
         max_val: maximum value of the linear region range. Default: 1
@@ -181,6 +180,8 @@ class Hardtanh(Module):
           dimensions
         - Output: :math:`(N, *)`, same shape as the input

+    .. image:: scripts/activation_images/Hardtanh.png
+
     Examples::

         >>> m = nn.Hardtanh(-2, 2)
@@ -394,8 +395,6 @@ class SELU(Module):
     with :math:`\alpha = 1.6732632423543772848170429916717` and
     :math:`\text{scale} = 1.0507009873554804934193349852946`.

-    .. image:: scripts/activation_images/SELU.png
-
     More details can be found in the paper `Self-Normalizing Neural Networks`_ .

     Args:
@@ -406,6 +405,8 @@ class SELU(Module):
          dimensions
        - Output: :math:`(N, *)`, same shape as the input

+    .. image:: scripts/activation_images/SELU.png
+
    Examples::

        >>> m = nn.SELU()
@@ -433,7 +434,7 @@ def extra_repr(self):
 class GLU(Module):
     r"""Applies the gated linear unit function
     :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half
-    of the input vector and :math:`b` is the second half.
+    of the input matrices and :math:`b` is the second half.

     Args:
         dim (int): the dimension on which to split the input. Default: -1
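For context, the module splits its input in half along :attr:`dim`; a minimal sketch (illustrative, not part of the diff):

>>> import torch
>>> from torch import nn
>>> m = nn.GLU(dim=1)
>>> x = torch.randn(4, 6, 5)
>>> m(x).shape                                      # dim 1 is halved: 6 -> 3
torch.Size([4, 3, 5])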
@@ -708,8 +709,7 @@ class PReLU(Module):
        - Output: :math:`(N, *)`, same shape as the input

     Attributes:
-        weight (Tensor): the learnable weights of shape (attr:`num_parameters`).
-            The attr:`dtype` is default to
+        weight (Tensor): the learnable weights of shape (:attr:`num_parameters`).

     .. image:: scripts/activation_images/PReLU.png

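The corrected attribute description can be checked quickly (illustrative sketch, not part of the commit):

>>> import torch
>>> from torch import nn
>>> m = nn.PReLU(num_parameters=3)                  # one learnable weight per channel
>>> m.weight.shape
torch.Size([3])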
@@ -789,7 +789,9 @@ def forward(self, input):
 class Softmin(Module):
     r"""Applies the Softmin function to an n-dimensional input Tensor
     rescaling them so that the elements of the n-dimensional output Tensor
-    lie in the range `(0, 1)` and sum to 1
+    lie in the range `[0, 1]` and sum to 1.
+
+    Softmin is defined as:

     .. math::
         \text{Softmin}(x_{i}) = \frac{\exp(-x_i)}{\sum_j \exp(-x_j)}
@@ -828,7 +830,7 @@ def forward(self, input):
 class Softmax(Module):
     r"""Applies the Softmax function to an n-dimensional input Tensor
     rescaling them so that the elements of the n-dimensional output Tensor
-    lie in the range (0,1) and sum to 1
+    lie in the range [0,1] and sum to 1.

     Softmax is defined as:

@@ -918,8 +920,7 @@ class LogSoftmax(Module):
        - Output: :math:`(*)`, same shape as the input

     Arguments:
-        dim (int): A dimension along which Softmax will be computed (so every slice
-            along dim will sum to 1).
+        dim (int): A dimension along which LogSoftmax will be computed.

     Returns:
         a Tensor of the same dimension and shape as the input with
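As a sanity check of the corrected argument description (illustrative only, not from the commit):

>>> import torch
>>> from torch import nn
>>> m = nn.LogSoftmax(dim=1)
>>> out = m(torch.randn(2, 4))
>>> torch.allclose(out.exp().sum(dim=1), torch.ones(2))  # exp of the output sums to 1 along dim
True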

torch/nn/modules/adaptive.py

Lines changed: 5 additions & 3 deletions
@@ -45,7 +45,7 @@ class AdaptiveLogSoftmaxWithLoss(Module):
       assigned to the first cluster, and targets `101, 102, ..., 1000` will be
       assigned to the second cluster, while targets
       `1001, 1002, ..., n_classes - 1` will be assigned
-      to the last, third cluster
+      to the last, third cluster.

     * :attr:`div_value` is used to compute the size of each additional cluster,
       which is given as
@@ -74,10 +74,12 @@ class AdaptiveLogSoftmaxWithLoss(Module):

     Args:
         in_features (int): Number of features in the input tensor
-        n_classes (int): Number of classes in the dataset.
-        cutoffs (Sequence): Cutoffs used to assign targets to their buckets.
+        n_classes (int): Number of classes in the dataset
+        cutoffs (Sequence): Cutoffs used to assign targets to their buckets
         div_value (float, optional): value used as an exponent to compute sizes
             of the clusters. Default: 4.0
+        head_bias (bool, optional): If ``True``, adds a bias term to the 'head' of the
+            adaptive softmax. Default: ``False``

     Returns:
         ``NamedTuple`` with ``output`` and ``loss`` fields:
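A minimal sketch of the newly documented :attr:`head_bias` argument and the returned fields (illustrative values, not from the commit):

>>> import torch
>>> from torch import nn
>>> asm = nn.AdaptiveLogSoftmaxWithLoss(in_features=16, n_classes=1000,
...                                     cutoffs=[100, 500], head_bias=True)
>>> x = torch.randn(8, 16)
>>> target = torch.randint(0, 1000, (8,))
>>> result = asm(x, target)
>>> result.output.shape, result.loss.shape          # per-sample log-probs and a scalar loss
(torch.Size([8]), torch.Size([]))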

torch/nn/modules/padding.py

Lines changed: 16 additions & 10 deletions
@@ -37,6 +37,7 @@ class ConstantPad1d(_ConstantPadNd):
     Shape:
         - Input: :math:`(N, C, W_{in})`
        - Output: :math:`(N, C, W_{out})` where
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
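A worked example of the shape formula above (illustrative, not part of the diff):

>>> import torch
>>> from torch import nn
>>> m = nn.ConstantPad1d((3, 1), 0.5)               # padding_left=3, padding_right=1
>>> x = torch.randn(1, 2, 4)
>>> m(x).shape                                      # W_out = 4 + 3 + 1 = 8
torch.Size([1, 2, 8])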
@@ -86,7 +87,9 @@ class ConstantPad2d(_ConstantPadNd):
     Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -97,13 +100,6 @@ class ConstantPad2d(_ConstantPadNd):
        tensor([[[ 1.6585,  0.4320],
                 [-0.8701, -0.4649]]])
        >>> m(input)
-       tensor([[[ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
-                [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
-                [ 3.5000,  3.5000,  1.6585,  0.4320,  3.5000,  3.5000],
-                [ 3.5000,  3.5000, -0.8701, -0.4649,  3.5000,  3.5000],
-                [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
-                [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000]]])
-       >>> m(input)
        tensor([[[ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  3.5000,  3.5000,  3.5000,  3.5000],
                 [ 3.5000,  3.5000,  1.6585,  0.4320,  3.5000,  3.5000],
@@ -143,8 +139,11 @@ class ConstantPad3d(_ConstantPadNd):
     Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where
+
          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -189,6 +188,7 @@ class ReflectionPad1d(_ReflectionPadNd):
     Shape:
        - Input: :math:`(N, C, W_{in})`
        - Output: :math:`(N, C, W_{out})` where
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -199,9 +199,6 @@ class ReflectionPad1d(_ReflectionPadNd):
        tensor([[[0., 1., 2., 3.],
                 [4., 5., 6., 7.]]])
        >>> m(input)
-       tensor([[[2., 1., 0., 1., 2., 3., 2., 1.],
-                [6., 5., 4., 5., 6., 7., 6., 5.]]])
-       >>> m(input)
        tensor([[[2., 1., 0., 1., 2., 3., 2., 1.],
                 [6., 5., 4., 5., 6., 7., 6., 5.]]])
        >>> # using different paddings for different sides
@@ -233,6 +230,7 @@ class ReflectionPad2d(_ReflectionPadNd):
        - Output: :math:`(N, C, H_{out}, W_{out})` where

          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -293,6 +291,7 @@ class ReplicationPad1d(_ReplicationPadNd):
     Shape:
        - Input: :math:`(N, C, W_{in})`
        - Output: :math:`(N, C, W_{out})` where
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -332,7 +331,9 @@ class ReplicationPad2d(_ReplicationPadNd):
     Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -383,8 +384,11 @@ class ReplicationPad3d(_ReplicationPadNd):
     Shape:
        - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
        - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})` where
+
          :math:`D_{out} = D_{in} + \text{padding\_front} + \text{padding\_back}`
+
          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
@@ -417,7 +421,9 @@ class ZeroPad2d(ConstantPad2d):
     Shape:
        - Input: :math:`(N, C, H_{in}, W_{in})`
        - Output: :math:`(N, C, H_{out}, W_{out})` where
+
          :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}`
+
          :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}`

     Examples::
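A worked example of the two shape formulas (illustrative sketch, not from the commit):

>>> import torch
>>> from torch import nn
>>> m = nn.ZeroPad2d((1, 1, 2, 0))                  # (left, right, top, bottom)
>>> x = torch.randn(1, 1, 3, 3)
>>> m(x).shape                                      # H_out = 3 + 2 + 0 = 5, W_out = 3 + 1 + 1 = 5
torch.Size([1, 1, 5, 5])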
