
Commit 5d43593

Add YaRN factor override to model_init
1 parent c84f597

2 files changed: +16 -10

exllamav2/config.py

+8 -9

@@ -287,31 +287,30 @@ def prepare(self, no_tensors: bool = False):
         rs = read(read_config, dict, "rope_scaling", None)
         if rs:
             scaling_type = rs.get("type", None)
+            rope_type = rs.get("rope_type", None)
+            assert not (scaling_type and rope_type), "rope_scaling key has both `type` and `rope_type` subkeys"
             if scaling_type == "linear":
                 assert "factor" in rs, "'factor' missing from 'rope_scaling' config"
                 self.scale_pos_emb = rs.get("factor", 1.0)
             if scaling_type == "su" or scaling_type == "longrope":
-                assert "long_factor" in rs, "'long_factor' missing from 'rope_scaling' config"
-                assert "short_factor" in rs, "'short_factor' missing from 'rope_scaling' config"
+                assert "long_factor" in rs, "'long_factor' missing from 'rope_scaling' config ('su' mode)"
+                assert "short_factor" in rs, "'short_factor' missing from 'rope_scaling' config ('su' mode)"
                 assert "original_max_position_embeddings" in read_config, \
                     "'original_max_position_embeddings' required for 'su' scaling"
                 self.scale_long_factor = rs["long_factor"]
                 self.scale_short_factor = rs["short_factor"]
                 self.original_max_seq_len = read_config["original_max_position_embeddings"]
                 self.alt_rope_method = "su"
-            # if scaling_type == "yarn":
-            #     self.scale_alpha_value = factor
-            rope_type = rs.get("rope_type", None)
+            if scaling_type == "yarn":
+                self.alt_rope_method = "yarn"
+                self.yarn_rope_factor = rs["factor"]
+                self.yarn_rope_original_max_position_embeddings = rs["original_max_position_embeddings"]
             if rope_type == "llama3":
                 self.alt_rope_method = "llama3"
                 self.l3_rope_factor = rs["factor"]
                 self.l3_rope_low_freq_factor = rs["low_freq_factor"]
                 self.l3_rope_high_freq_factor = rs["high_freq_factor"]
                 self.l3_rope_original_max_position_embeddings = rs["original_max_position_embeddings"]
-            if scaling_type == "yarn":
-                self.alt_rope_method = "yarn"
-                self.yarn_rope_factor = rs["factor"]
-                self.yarn_rope_original_max_position_embeddings = rs["original_max_position_embeddings"]

         # Checkpoint format (for GPTQ models)
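
For reference, a minimal sketch of the two rope_scaling shapes in a model's config.json that the branches above distinguish. The key names come from the diff; the numeric values are illustrative placeholders, not taken from any particular model:

# YaRN scaling, keyed by "type" (the new guard rejects configs that set
# both "type" and "rope_type"):
yarn_rs = {
    "type": "yarn",
    "factor": 4.0,                              # -> self.yarn_rope_factor
    "original_max_position_embeddings": 32768,  # -> self.yarn_rope_original_max_position_embeddings
}

# Llama-3-style scaling, keyed by "rope_type":
llama3_rs = {
    "rope_type": "llama3",
    "factor": 8.0,
    "low_freq_factor": 1.0,
    "high_freq_factor": 4.0,
    "original_max_position_embeddings": 8192,
}

for rs in (yarn_rs, llama3_rs):
    scaling_type = rs.get("type", None)
    rope_type = rs.get("rope_type", None)
    # The commit's new guard: at most one of the two key spellings may appear
    assert not (scaling_type and rope_type)

Note that the rope_type lookup is hoisted to the top, next to scaling_type, so the new assert can reject configs carrying both key spellings before either branch consumes them.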
exllamav2/model_init.py

+8 -1

@@ -15,6 +15,7 @@ def add_args(parser):
     parser.add_argument("-l", "--length", type = int, help = "Maximum sequence length")
     parser.add_argument("-rs", "--rope_scale", type = float, help = "RoPE scaling factor")
     parser.add_argument("-ra", "--rope_alpha", type = float, help = "RoPE alpha value (NTK)")
+    parser.add_argument("-ry", "--rope_yarn", type = float, help = "Set RoPE YaRN factor (use default max_seq_len as original_max_position_embeddings if not configured)")
     parser.add_argument("-nfa", "--no_flash_attn", action = "store_true", help = "Disable Flash Attention")
     parser.add_argument("-nxf", "--no_xformers", action = "store_true", help = "Disable xformers, an alternative to flash attn for older devices")
     parser.add_argument("-nsdpa", "--no_sdpa", action = "store_true", help = "Disable Torch SDPA")
@@ -27,7 +28,6 @@ def add_args(parser):
     parser.add_argument("-chunk", "--chunk_size", type = int, help = "Chunk size ('input length')")
 
 
-
 def print_options(args):
 
     print(f" -- Model: {args.model_dir}")
@@ -38,6 +38,7 @@ def print_options(args):
     if args.length is not None: print_opts += [f"length: {args.length}"]
     if args.rope_scale is not None: print_opts += [f"rope_scale: {args.rope_scale}"]
     if args.rope_alpha is not None: print_opts += [f"rope_alpha: {args.rope_alpha}"]
+    if args.rope_yarn is not None: print_opts += [f"rope_yarn: {args.rope_yarn}"]
     if args.no_flash_attn: print_opts += ["no_flash_attn"]
     if args.no_xformers: print_opts += ["no_xformers"]
     if args.no_sdpa: print_opts += ["no_sdpa"]
@@ -97,6 +98,12 @@ def init(
 
     # Set config options
 
+    if args.rope_yarn:
+        if config.alt_rope_method != "yarn":
+            config.yarn_rope_original_max_position_embeddings = config.max_seq_len
+        config.alt_rope_method = "yarn"
+        config.yarn_rope_factor = args.rope_yarn
+
     if args.length: config.max_seq_len = args.length
     if args.rope_scale: config.scale_pos_emb = args.rope_scale
     if args.rope_alpha: config.scale_alpha_value = args.rope_alpha
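
A usage sketch of the new flag, assuming the argparse plumbing above and the init() entry point as used by the bundled example scripts; the model path is a hypothetical placeholder:

import argparse
from exllamav2 import model_init

parser = argparse.ArgumentParser()
model_init.add_args(parser)

# Equivalent CLI: <script>.py -m /models/my-model -ry 4.0 -l 65536
args = parser.parse_args(["-m", "/models/my-model", "-ry", "4.0", "-l", "65536"])

# In the example scripts, init() builds the config, applies the overrides
# above, loads the model, and returns the model/tokenizer pair
model, tokenizer = model_init.init(args)

Because the rope_yarn block runs before the `if args.length:` override, the original_max_position_embeddings fallback captures the model's native max_seq_len rather than the -l value, which is exactly what the flag's help text promises; a config that already specifies YaRN keeps its own value.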
