@@ -182,6 +182,66 @@ def prepare_sincos(self):
182
182
cfg .l3_rope_original_max_position_embeddings ,
183
183
)
184
184
185
+ # YaRN
186
+ # Adapted from transformers: https://github.com/huggingface/transformers/blob/2e24ee4dfa39cc0bc264b89edbccc373c8337086/src/transformers/modeling_rope_utils.py#L163
187
+
188
+ elif cfg .alt_rope_method == "yarn" :
189
+
190
+ yarn_max_position_embeddings = cfg .max_seq_len
191
+
192
+ # Only activate if longer than original ctx
193
+ if cfg .max_seq_len > cfg .yarn_rope_original_max_position_embeddings :
194
+
195
+ partial_rotary_factor = 1.0 # Placeholder, assume no partial_rotary_factor in config.
196
+ dim = int (head_dim * partial_rotary_factor )
197
+
198
+ factor = cfg .yarn_rope_factor
199
+
200
+ # Sets the attention factor as suggested in the paper
201
+ # See: https://github.com/huggingface/transformers/blob/main/examples/modular-transformers/modeling_super.py#L190-L191
202
+ scaling_factor = 0.1 * math .log (factor ) + 1.0
203
+
204
+ # Optional config options
205
+ # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
206
+ beta_fast = 32
207
+ beta_slow = 1
208
+
209
+ # Compute the inverse frequencies
210
def find_correction_dim(num_rotations, dim, base, yarn_max_position_embeddings):
    """Inverse of the rotation-count formula: return the (fractional) dimension
    index at which the rotary embedding completes `num_rotations` full rotations
    over `yarn_max_position_embeddings` positions, for the given frequency `base`.
    """
    rotations_term = math.log(yarn_max_position_embeddings / (num_rotations * 2 * math.pi))
    return dim * rotations_term / (2 * math.log(base))
213
+
214
def find_correction_range(low_rot, high_rot, dim, base, yarn_max_position_embeddings):
    """Return the integer dimension-index bounds of the interpolation/extrapolation
    ramp, derived from the target rotation counts and clamped to [0, dim - 1].
    """
    lo = find_correction_dim(low_rot, dim, base, yarn_max_position_embeddings)
    hi = find_correction_dim(high_rot, dim, base, yarn_max_position_embeddings)
    return max(math.floor(lo), 0), min(math.ceil(hi), dim - 1)
219
+
220
def linear_ramp_factor(min_val, max_val, dim):
    """Build a length-`dim` float32 tensor that ramps linearly from 0 (at index
    `min_val`) to 1 (at index `max_val`), clamped to [0, 1] outside that window.

    Parameters were renamed from `min`/`max` to avoid shadowing the Python
    builtins; the only call site passes arguments positionally, so the rename
    is backward-compatible.
    """
    if min_val == max_val:
        max_val += 0.001  # Prevent singularity (division by zero below)

    linear_func = (torch.arange(dim, dtype=torch.float32) - min_val) / (max_val - min_val)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func
227
+
228
+ # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
229
+ # to expand the possible context length. In other words, interpolation = apply scaling factor.
230
+ pos_freqs = base ** (torch .arange (0 , dim , 2 ).float ().to (device ) / dim )
231
+ inv_freq_extrapolation = 1.0 / pos_freqs
232
+ inv_freq_interpolation = 1.0 / (factor * pos_freqs )
233
+
234
+ low , high = find_correction_range (beta_fast , beta_slow , dim , base , yarn_max_position_embeddings )
235
+
236
+ # Get n-dimensional rotational scaling corrected for extrapolation
237
+ inv_freq_extrapolation_factor = 1 - linear_ramp_factor (low , high , dim // 2 ).float ().to (device )
238
+ inv_freq = (
239
+ inv_freq_interpolation * (1 - inv_freq_extrapolation_factor )
240
+ + inv_freq_extrapolation * inv_freq_extrapolation_factor
241
+ )
242
+ else :
243
+ inv_freq = 1.0 / (base ** (torch .arange (0 , head_dim , 2 , device = device ).float () / head_dim ))
244
+
185
245
# Regular
186
246
187
247
else :
0 commit comments