Commit 0d78f03

Add YaRN
1 parent 7c7b199 commit 0d78f03

File tree

1 file changed (+53, -0 lines)


exllamav2/device.py

Lines changed: 53 additions & 0 deletions
@@ -182,6 +182,59 @@ def prepare_sincos(self):
                 cfg.l3_rope_original_max_position_embeddings,
             )
 
+        # YaRN
+        # Adapted from transformers: https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/modeling_rope_utils.py#L163
+
+        elif cfg.alt_rope_method == "yarn":
+
+            partial_rotary_factor = 1.0  # Placeholder, assume no partial_rotary_factor in config.
+            dim = int(head_dim * partial_rotary_factor)
+            yarn_max_position_embeddings = cfg.yarn_rope_original_max_position_embeddings
+            factor = cfg.yarn_rope_factor
+
+            # Sets the attention factor as suggested in the paper
+            # See: https://github.com/huggingface/transformers/blob/main/examples/modular-transformers/modeling_super.py#L190-L191
+            scaling_factor = 0.1 * math.log(factor) + 1.0
+
+            # Optional config options
+            # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+            beta_fast = 32
+            beta_slow = 1
+
+            # Compute the inverse frequencies
+            def find_correction_dim(num_rotations, dim, base, yarn_max_position_embeddings):
+                """Inverse dimension formula to find the dimension based on the number of rotations"""
+                return (dim * math.log(yarn_max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+            def find_correction_range(low_rot, high_rot, dim, base, yarn_max_position_embeddings):
+                """Find dimension range bounds based on rotations"""
+                low = math.floor(find_correction_dim(low_rot, dim, base, yarn_max_position_embeddings))
+                high = math.ceil(find_correction_dim(high_rot, dim, base, yarn_max_position_embeddings))
+                return max(low, 0), min(high, dim - 1)
+
+            def linear_ramp_factor(min, max, dim):
+                if min == max:
+                    max += 0.001  # Prevent singularity
+
+                linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+                ramp_func = torch.clamp(linear_func, 0, 1)
+                return ramp_func
+
+            # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+            # to expand the possible context length. In other words, interpolation = apply scaling factor.
+            pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
+            inv_freq_extrapolation = 1.0 / pos_freqs
+            inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+            low, high = find_correction_range(beta_fast, beta_slow, dim, base, yarn_max_position_embeddings)
+
+            # Get n-dimensional rotational scaling corrected for extrapolation
+            inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
+            inv_freq = (
+                inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+                + inv_freq_extrapolation * inv_freq_extrapolation_factor
+            )
+
         # Regular
 
         else:
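Note on the added branch: YaRN blends two sets of RoPE inverse frequencies, an unscaled "extrapolation" set for the fast-rotating dimensions and a position-interpolated set (divided by the scaling factor) for the slow-rotating ones, with a linear ramp between the two correction bounds. Below is a minimal standalone sketch of that computation for reference. The concrete numbers (head_dim = 128, base = 10000, factor = 4, original context length 4096) are illustrative assumptions only, not values read from any ExLlamaV2 config; the helper logic mirrors the diff above.

    # Standalone YaRN inverse-frequency sketch (illustrative values, not from a real config)
    import math
    import torch

    head_dim = 128                # assumed head dimension
    base = 10000.0                # assumed RoPE theta
    factor = 4.0                  # assumed YaRN scaling factor
    original_max_pos = 4096       # assumed original max_position_embeddings
    beta_fast, beta_slow = 32, 1  # defaults suggested in the YaRN paper
    dim = head_dim                # partial_rotary_factor assumed to be 1.0

    def find_correction_dim(num_rotations):
        # Dimension index whose frequency completes `num_rotations` rotations
        # over the original context length.
        return (dim * math.log(original_max_pos / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    low = max(math.floor(find_correction_dim(beta_fast)), 0)
    high = min(math.ceil(find_correction_dim(beta_slow)), dim - 1)

    # Linear ramp from 0 to 1 between the correction bounds, one value per frequency pair
    ramp = torch.clamp((torch.arange(dim // 2, dtype=torch.float32) - low) / (high - low), 0, 1)
    extrapolation_factor = 1 - ramp

    pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs             # unscaled (high-frequency) path
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)  # position-interpolated (low-frequency) path

    # Blend: fast-rotating dims keep their original frequencies, slow ones are interpolated
    inv_freq = (inv_freq_interpolation * (1 - extrapolation_factor)
                + inv_freq_extrapolation * extrapolation_factor)

    # Attention scaling suggested in the YaRN paper
    attn_scale = 0.1 * math.log(factor) + 1.0
    print(inv_freq.shape, attn_scale)  # torch.Size([64]) ~1.14

With these assumed numbers the correction range works out to roughly dimensions 20 to 46 of the 64 frequency pairs, so the highest-frequency pairs keep their original rotation speed while the lowest-frequency pairs are fully interpolated, and the attention scaling factor comes out to about 1.14.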
