Only trigger if long context config is set

Downtown-Case · web-flow · commit 8dca1abf44c5 · 2024-09-27T19:44:49.000-04:00
diff --git a/exllamav2/device.py b/exllamav2/device.py
@@ -187,53 +187,60 @@ def prepare_sincos(self):
         
         elif cfg.alt_rope_method == "yarn":
 
-            partial_rotary_factor = 1.0 # Placeholder, assume no partial_rotary_factor in config.
-            dim = int(head_dim * partial_rotary_factor)
             yarn_max_position_embeddings = cfg.yarn_rope_original_max_position_embeddings
-            factor = cfg.yarn_rope_factor
-
-            # Sets the attention factor as suggested in the paper
-            # See: https://github.com/huggingface/transformers/blob/main/examples/modular-transformers/modeling_super.py#L190-L191
-            scaling_factor = 0.1 * math.log(factor) + 1.0 
-
-            # Optional config options
-            # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
-            beta_fast = 32
-            beta_slow = 1
-
-            # Compute the inverse frequencies
-            def find_correction_dim(num_rotations, dim, base, yarn_max_position_embeddings):
-                """Inverse dimension formula to find the dimension based on the number of rotations"""
-                return (dim * math.log(yarn_max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
-
-            def find_correction_range(low_rot, high_rot, dim, base, yarn_max_position_embeddings):
-                """Find dimension range bounds based on rotations"""
-                low = math.floor(find_correction_dim(low_rot, dim, base, yarn_max_position_embeddings))
-                high = math.ceil(find_correction_dim(high_rot, dim, base, yarn_max_position_embeddings))
-                return max(low, 0), min(high, dim - 1)
-
-            def linear_ramp_factor(min, max, dim):
-                if min == max:
-                    max += 0.001  # Prevent singularity
-
-                linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
-                ramp_func = torch.clamp(linear_func, 0, 1)
-                return ramp_func
-
-            # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
-            # to expand the possible context length. In other words, interpolation = apply scaling factor.
-            pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
-            inv_freq_extrapolation = 1.0 / pos_freqs
-            inv_freq_interpolation = 1.0 / (factor * pos_freqs)
-
-            low, high = find_correction_range(beta_fast, beta_slow, dim, base, yarn_max_position_embeddings)
-
-            # Get n-dimensional rotational scaling corrected for extrapolation
-            inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
-            inv_freq = (
-                inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
-                + inv_freq_extrapolation * inv_freq_extrapolation_factor
-            )
+
+            # Only activate if longer than original ctx
+            if cfg.max_seq_len > cfg.yarn_rope_original_max_position_embeddings:
+                
+                partial_rotary_factor = 1.0 # Placeholder, assume no partial_rotary_factor in config.
+                dim = int(head_dim * partial_rotary_factor)
+
+                factor = cfg.yarn_rope_factor
+
+                # Sets the attention factor as suggested in the paper
+                # See: https://github.com/huggingface/transformers/blob/main/examples/modular-transformers/modeling_super.py#L190-L191
+                scaling_factor = 0.1 * math.log(factor) + 1.0 
+
+                # Optional config options
+                # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+                beta_fast = 32
+                beta_slow = 1
+
+                # Compute the inverse frequencies
+                def find_correction_dim(num_rotations, dim, base, yarn_max_position_embeddings):
+                    """Inverse dimension formula to find the dimension based on the number of rotations"""
+                    return (dim * math.log(yarn_max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
+
+                def find_correction_range(low_rot, high_rot, dim, base, yarn_max_position_embeddings):
+                    """Find dimension range bounds based on rotations"""
+                    low = math.floor(find_correction_dim(low_rot, dim, base, yarn_max_position_embeddings))
+                    high = math.ceil(find_correction_dim(high_rot, dim, base, yarn_max_position_embeddings))
+                    return max(low, 0), min(high, dim - 1)
+
+                def linear_ramp_factor(min, max, dim):
+                    if min == max:
+                        max += 0.001  # Prevent singularity
+
+                    linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+                    ramp_func = torch.clamp(linear_func, 0, 1)
+                    return ramp_func
+
+                # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+                # to expand the possible context length. In other words, interpolation = apply scaling factor.
+                pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
+                inv_freq_extrapolation = 1.0 / pos_freqs
+                inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+                low, high = find_correction_range(beta_fast, beta_slow, dim, base, yarn_max_position_embeddings)
+
+                # Get n-dimensional rotational scaling corrected for extrapolation
+                inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
+                inv_freq = (
+                    inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+                    + inv_freq_extrapolation * inv_freq_extrapolation_factor
+                )
+            else:
+                inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = device).float() / head_dim))
 
         # Regular