@@ -268,3 +268,149 @@ def forward(
        # Squash tile dimension back into sequence dimension - tensor has shape [b, s, n_h, h_d]
        x_out = x_out.reshape(bsz, self.max_num_tiles * seq_len, n_h, h_d)
        return x_out.type_as(x)
+
+
+class FireSelfAttention(nn.Module):
+    """
+    This class implements FIRE (Functional Interpolation for Relative Positional Encodings)
+    as described in https://arxiv.org/abs/2310.04418 for causal language modeling tasks. The
+    only modification from the paper is that this implementation uses the GELU activation
+    function instead of ReLU in order to avoid possible problems with "dying" neurons.
+
+    Args:
+        dim_model (int): The embedding dimension of the input vectors.
+        num_heads (int): The number of self-attention heads, set to 1 by default. The dimension
+            of each individual head is computed as ``dim_model // num_heads``.
+        hidden_size (int): The dimension of the MLP layers in each attention head used to
+            compute the bias matrix.
+
+    Note: This module is fundamentally a positional encoding scheme; however, due to the nature
+    of FIRE relative positional encodings, it takes the form of an attention layer.
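+
+    Example (a minimal usage sketch; the sizes and shapes below are illustrative only):
+        >>> fire_attn = FireSelfAttention(dim_model=64, num_heads=4, hidden_size=32)
+        >>> src = torch.randn(2, 10, 64)
+        >>> fire_attn(src).shape
+        torch.Size([2, 10, 64])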
+    """
+
+    def __init__(
+        self, dim_model: int, num_heads: int = 1, hidden_size: int = 32
+    ) -> None:
+        super().__init__()
+
+        # make sure num_heads divides dim_model
+        assert (
+            dim_model % num_heads == 0
+        ), "Number of heads must divide dimension of model"
+
+        # compute kdim = vdim
+        kdim = dim_model // num_heads
+
+        # initialize attention heads
+        self.attention_heads = nn.ModuleList(
+            [
+                self.FireAttentionHead(dim_model, kdim, hidden_size)
+                for _ in range(num_heads)
+            ]
+        )
+
+        # final linear layer
+        self.W_o = nn.Linear(dim_model, dim_model, bias=False)
+
+    class FireAttentionHead(nn.Module):
+        """
+        An inner class that implements a single attention head using the FIRE positional
+        encoding scheme. **Do not** use this class directly; instead use FireSelfAttention
+        with ``num_heads = 1`` if you need it.
+
+        Args:
+            dim_model (int): The embedding dimension of the input vectors, as above.
+            kdim (int): The dimension of the query, key, and value vectors, computed as
+                ``kdim = dim_model // num_heads``.
+            hidden_size (int): The dimension of the MLP layers used to compute the bias matrix.
+        """
+
+        def __init__(self, dim_model: int, kdim: int, hidden_size: int) -> None:
+            super().__init__()
+            self.kdim = kdim
+
+            # initialize parameter matrices
+            self.W_q = nn.Linear(dim_model, kdim, bias=False)
+            self.W_k = nn.Linear(dim_model, kdim, bias=False)
+            self.W_v = nn.Linear(dim_model, kdim, bias=False)
+
+            # initialize learnable scalars to "reasonable" values (these are arbitrary and can be adjusted later on)
+            # c is used to modify the input of the logarithm in the phi function.
+            self.c = nn.Parameter(torch.tensor(1.0))
+            # L is used in the adaptive thresholding mechanism to activate progressive interpolation only for long contexts.
+            self.L = nn.Parameter(torch.tensor(2.0))
+
+            # initialize learnable continuous function
+            self.f_theta = nn.Sequential(
+                nn.Linear(1, hidden_size),
+                nn.GELU(),
+                nn.Linear(hidden_size, hidden_size),
+                nn.GELU(),
+                nn.Linear(hidden_size, 1),
+            )
+
+        # concave function to amplify differences among local positions
+        def phi(self, c: torch.Tensor, x: int | torch.Tensor) -> torch.Tensor:
+            return torch.log1p(c * x)
+
+        def forward(self, src: torch.Tensor) -> torch.Tensor:
+            """
+            Args:
+                src (torch.Tensor): Input tensor with shape ``[batch_size, seq_length, dim_model]``
+
+            Returns:
+                torch.Tensor: Output tensor of shape ``[batch_size, seq_length, kdim]``
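+
+            Note (summary of the computation below, stated here for clarity):
+                Each bias entry with ``j < i`` is computed as
+                ``bias[i, j] = phi(c, i - j) / phi(c, max(L, i + 1))`` with
+                ``phi(c, x) = log(1 + c * x)``, and the full bias matrix is then passed
+                elementwise through the MLP ``f_theta`` before the causal mask is applied.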
+            """
+            # Assuming src has shape (batch_size, seq_length, dim_model)
+            batch_size, seq_length = src.shape[0:2]
+
+            # constrain c to be > 0
+            c = torch.nn.functional.softplus(self.c)
+
+            # compute bias matrix
+            # below, i is the query position and j is the key position, 1 <= i - j <= i
+            bias = torch.zeros(seq_length, seq_length, device=src.device)
+            for i in range(1, seq_length):
+                for j in range(0, i):
+                    # we have to use i + 1 in the denominator to compensate for 0-based indexing
+                    bias[i, j] = self.phi(c, i - j) / self.phi(
+                        c, torch.maximum(self.L, torch.tensor(i + 1, device=src.device))
+                    )
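+            # NOTE (assumed optimization, not part of the original logic): this Python double
+            # loop could be vectorized, e.g. with idx = torch.arange(seq_length, device=src.device),
+            # rel = (idx[:, None] - idx[None, :]).clamp(min=0), and computing
+            # self.phi(c, rel) / self.phi(c, torch.maximum(self.L, (idx + 1).float()))[:, None].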
+            # apply MLP to bias matrix
+            bias = self.f_theta(bias.unsqueeze(2)).squeeze(2)
+            # add causal mask
+            lookahead_mask = torch.ones(
+                seq_length, seq_length, dtype=torch.bool, device=src.device
+            ).triu(diagonal=1)
+            bias.masked_fill_(lookahead_mask, float("-inf"))
+            # repeat bias matrix for batch_size
+            bias = bias.repeat(batch_size, 1, 1)
+
+            # get Query, Key, and Value matrices for each sequence
+            q = self.W_q(src)
+            k = self.W_k(src)
+            v = self.W_v(src)
+
+            # calculate attention scores
+            k_t = torch.transpose(k, 1, 2)
+            attn_logits = torch.bmm(q, k_t) / (self.kdim**0.5)
+            attn_logits = attn_logits + bias
+            attn_weights = torch.nn.functional.softmax(attn_logits, dim=-1)
+            attn_outputs = torch.bmm(attn_weights, v)
+            return attn_outputs
+
+    # End of the inner class for a single attention head
+
+    def forward(self, src: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            src (torch.Tensor): Input tensor with shape ``[batch_size, seq_length, dim_model]``
+
+        Returns:
+            torch.Tensor: Output tensor of shape ``[batch_size, seq_length, dim_model]`` with
+            multi-head attention and FIRE relative positional encoding applied.
+        """
+        # src should have shape (batch_size, seq_length, dim_model)
+        # Pass src through the attention heads
+        attn_results = [attn_head(src) for attn_head in self.attention_heads]
+        # concatenate results
+        attn_results = torch.cat(attn_results, dim=-1)
+        # pass through final linear layer
+        return self.W_o(attn_results)