Commit 467568e

Lint and fix qat/spin

1 parent: d7cea25

3 files changed, +30 -28 lines

examples/models/llama/eval_llama_lib.py (-2)

```diff
@@ -21,8 +21,6 @@
 from pytorch_tokenizers import get_tokenizer
 from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer
 from pytorch_tokenizers.tiktoken import TiktokenTokenizer as Tiktoken
-from torch.nn import CrossEntropyLoss
-from tqdm import tqdm
 
 from .evaluate.eager_eval import EagerEvalWrapper
 
```
examples/models/llama/export_llama_lib.py (+1 -1)

```diff
@@ -1251,7 +1251,7 @@ def _load_llama_model(
             input_prune_map_path=input_prune_map_path,
             output_prune_map_path=output_prune_map_path,
             dtype=torch_dtype,
-            args=config,
+            config=config,
         )
     )
 
```
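The hunk above and the model.py changes below tie together: the call site now passes `config=config`, and the model constructor reads it back with `kwargs.get("config", None)`. A minimal, hypothetical sketch of that handshake (the class and function names here are illustrative, not the real ExecuTorch APIs; only the keyword name and the `kwargs.get()` lookup come from the diff):

```python
# Hypothetical sketch of the keyword handshake introduced by this commit.

class Llama2ModelSketch:
    def __init__(self, **kwargs):
        # After this commit the model looks up "config" instead of "args";
        # a mismatched keyword would silently leave self.config as None.
        self.config = kwargs.get("config", None)


def load_llama_model_sketch(config):
    # Call site: the keyword must match what __init__ looks up.
    return Llama2ModelSketch(config=config)
```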

examples/models/llama/model.py (+29 -25)

```diff
@@ -53,7 +53,7 @@ def __init__(self, **kwargs):
         self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
         self.max_seq_len = kwargs.get("max_seq_len", 128)
         self.max_context_len = kwargs.get("max_context_len", 128)
-        self.args = kwargs.get("args", None)
+        self.config = kwargs.get("config", None)
 
         assert (
             self.max_context_len >= self.max_seq_len
@@ -156,10 +156,10 @@ def __init__(self, **kwargs):
 
         if model_args.use_scaled_rope:
             # Older models don't have use_scaled_rope configuration
-            assert self.args.model not in ["llama2", "stories110m"]
+            assert self.config.model.name not in ["llama2", "stories110m"]
 
             # Llama3_2 and newer models in ExecuTorch repo should set larger scale factor
-            if self.args.model not in ["llama3", "llama3_1"]:
+            if self.config and self.config.model.name not in ["llama3", "llama3_1"]:
                 model_args.rope_scale_factor = 32
 
         if kwargs.get("verbose", False):
```
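The rope hunk assumes the config exposes the model name at `config.model.name`. A tiny sketch of the guarded access pattern it uses, with `SimpleNamespace` standing in for the real config type (an assumption for illustration only):

```python
from types import SimpleNamespace

# Stand-in config object; the real type is whatever the export flow passes in.
config = SimpleNamespace(model=SimpleNamespace(name="llama3_2"))

# Guarded access: only dereference nested fields when a config is present,
# mirroring the `self.config and self.config.model.name not in [...]` check.
if config and config.model.name not in ["llama3", "llama3_1"]:
    rope_scale_factor = 32  # larger scale factor for newer models
```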
examples/models/llama/model.py (continued):

```diff
@@ -194,7 +194,7 @@ def __init__(self, **kwargs):
             self.model_ = Int8DynActInt4WeightQuantizer()._convert_for_runtime(
                 self.model_
             )
-        elif hasattr(self.args, "use_spin_quant") and self.args.use_spin_quant:
+        elif self.config and self.config.quantization.use_spin_quant:
             print("Using SPIN quantization.")
             self._transform_for_pre_quantization(checkpoint, model_args)
 
@@ -203,19 +203,19 @@ def __init__(self, **kwargs):
             )
 
             sanitize_checkpoint_from_pre_quantization(checkpoint)
-        elif hasattr(self.args, "use_qat") and self.args.use_qat:
+        elif self.config and self.config.quantization.use_qat:
             print("Using QAT quantization.")
             self._transform_for_pre_quantization(checkpoint, model_args)
-            if hasattr(self.args, "use_lora") and self.args.use_lora:
-                assert model_args.lora_args["rank"] == self.args.use_lora
+            if self.config and self.config.quantization.use_lora:
+                assert model_args.lora_args["rank"] == self.config.quantization.use_lora
                 from .source_transformation.lora import (
                     transform_linear_for_lora_after_quantization,
                 )
 
                 self.model_ = transform_linear_for_lora_after_quantization(
                     self.model_,
                     checkpoint,
-                    self.args.use_lora,
+                    self.config.quantization.use_lora,
                 )
 
             from .source_transformation.pre_quantization import (
```
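Taken together, the new attribute paths in this file (`config.model.name`, `config.quantization.use_spin_quant`, `use_qat`, `use_lora`, the `preq_*` fields below, `config.misc.use_attention_sink`, and `config.sequence.max_context_length`) imply a grouped config object. A hypothetical dataclass sketch of that shape, inferred from this diff alone (field names are as used in the diff; the grouping, types, and defaults are assumptions, not the real config definition):

```python
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelCfg:
    name: str = "llama3"
    dtype_override: Optional[str] = None  # e.g. "fp32"


@dataclass
class QuantizationCfg:
    use_spin_quant: Optional[str] = None
    use_qat: bool = False
    use_lora: int = 0                      # LoRA rank; 0 means disabled
    preq_mode: Optional[str] = None        # "8da4w" or "8da4w_output_8da8w"
    preq_group_size: Optional[int] = None
    preq_embedding_quantize: Optional[str] = None  # "bitwidth,group_size"


@dataclass
class MiscCfg:
    use_attention_sink: Optional[str] = None  # "sink,window,eviction_batch"


@dataclass
class SequenceCfg:
    max_context_length: int = 128


@dataclass
class Config:
    model: ModelCfg = field(default_factory=ModelCfg)
    quantization: QuantizationCfg = field(default_factory=QuantizationCfg)
    misc: MiscCfg = field(default_factory=MiscCfg)
    sequence: SequenceCfg = field(default_factory=SequenceCfg)
```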
examples/models/llama/model.py (continued):

```diff
@@ -224,16 +224,16 @@ def __init__(self, **kwargs):
 
             sanitize_checkpoint_from_pre_quantization(checkpoint)
 
-        if hasattr(self.args, "use_attention_sink") and self.args.use_attention_sink:
+        if self.config and self.config.misc.use_attention_sink:
             from .source_transformation.attention_sink import enable_attention_sink
 
-            attention_sink_params = self.args.use_attention_sink.split(",")
+            attention_sink_params = self.config.misc.use_attention_sink.split(",")
             assert len(attention_sink_params) == 3
             sink_size = int(attention_sink_params[0])
             window_size = int(attention_sink_params[1])
             eviction_batch_size = int(attention_sink_params[2])
 
-            assert self.args.max_context_length == sink_size + window_size
+            assert self.config.sequence.max_context_length == sink_size + window_size
 
             self.model_ = enable_attention_sink(
                 module=self.model_,
```
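A small sketch of the attention-sink parameter handling in the hunk above: the config stores a single `"sink_size,window_size,eviction_batch_size"` string, which is split and checked against the maximum context length. The standalone helper and the example numbers are illustrative, not part of the diff:

```python
def parse_attention_sink_params(spec: str, max_context_length: int):
    """Parse "sink_size,window_size,eviction_batch_size" and check that
    sink + window fills the whole context length, as the diff asserts."""
    parts = spec.split(",")
    assert len(parts) == 3
    sink_size, window_size, eviction_batch_size = (int(p) for p in parts)
    assert max_context_length == sink_size + window_size
    return sink_size, window_size, eviction_batch_size


# Example: a 2048-token context split into a 4-token sink and a 2044-token window.
print(parse_attention_sink_params("4,2044,1", 2048))  # (4, 2044, 1)
```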
examples/models/llama/model.py (continued):

```diff
@@ -321,20 +321,24 @@ def get_example_inputs_kvcache_sdpa(self):
             )
 
     def _transform_for_pre_quantization(self, checkpoint, model_args):
-        assert hasattr(self.args, "preq_mode"), "preq_mode must be specified"
-        assert self.args.preq_mode in [
+        assert self.config
+        assert self.config.quantization.preq_mode, "preq_mode must be specified"
+        assert self.config.quantization.preq_mode in [
             "8da4w",
             "8da4w_output_8da8w",
-        ], f"Quantization mode {self.args.preq_mode} is not compatible with SpinQuant."
-        assert hasattr(
-            self.args, "preq_group_size"
+        ], f"Quantization mode {self.config.quantization.preq_mode} is not compatible with SpinQuant."
+        assert (
+            self.config.quantization.preq_group_size
         ), "preq_group_size must be specified"
-        assert hasattr(self.args, "dtype_override"), "dtype_override must be specified"
+        assert self.config.model.dtype_override, "dtype_override must be specified"
         from .source_transformation.pre_quantization import (
             transform_linear_for_pre_quantization,
         )
 
-        assert self.args.preq_group_size == model_args.quantization_args["group_size"]
+        assert (
+            self.config.quantization.preq_group_size
+            == model_args.quantization_args["group_size"]
+        )
 
         mapping = {
             "fp32": torch.float32,
@@ -343,28 +347,28 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
         }
 
         # Transform the output layer first if needed.
-        if self.args.preq_mode == "8da4w_output_8da8w":
+        if self.config.quantization.preq_mode == "8da4w_output_8da8w":
            from .source_transformation.pre_quantization import (
                transform_output_linear_for_pre_quantization,
            )
 
            self.model_ = transform_output_linear_for_pre_quantization(
                module=self.model_,
                checkpoint=checkpoint,
-                dtype=mapping[self.args.dtype_override],
+                dtype=mapping[self.config.model.dtype_override],
            )
 
        self.model_ = transform_linear_for_pre_quantization(
            self.model_,
            checkpoint,
-            self.args.preq_group_size,
-            mapping[self.args.dtype_override],
+            self.config.quantization.preq_group_size,
+            mapping[self.config.model.dtype_override],
        )
 
        embedding_bit_width, embedding_group_size = None, None
-        if hasattr(self.args, "preq_embedding_quantize"):
+        if self.config.quantization.preq_embedding_quantize:
            embedding_bit_width, embedding_group_size = (
-                self.args.preq_embedding_quantize.split(",")
+                self.config.quantization.preq_embedding_quantize.split(",")
            )
            from .source_transformation.pre_quantization import (
                transform_embedding_for_pre_quantization,
```
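For reference, a sketch of the two string-to-value conversions the pre-quantization path above relies on: the dtype-override string is resolved through a mapping like the `mapping = {"fp32": torch.float32, ...}` dict in the diff, and `preq_embedding_quantize` is a `"bitwidth,group_size"` string. Only the `"fp32"` key is visible in the hunk; the extra keys, the helper function, and the example values are assumptions:

```python
import torch

# Dtype mapping in the spirit of model.py's `mapping` dict; only "fp32" is
# visible in the diff, the other keys here are assumptions.
DTYPE_MAP = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


def parse_preq_embedding_quantize(spec: str):
    # e.g. "4,32" -> 4-bit embedding weights with group size 32 (illustrative)
    bit_width, group_size = spec.split(",")
    return int(bit_width), int(group_size)


dtype = DTYPE_MAP["fp32"]                            # -> torch.float32
bits, group = parse_preq_embedding_quantize("4,32")  # -> (4, 32)
```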
examples/models/llama/model.py (continued):

```diff
@@ -382,7 +386,7 @@ def _transform_for_pre_quantization(self, checkpoint, model_args):
         self.model_ = transform_embedding_for_pre_quantization(
             self.model_,
             checkpoint,
-            mapping[self.args.dtype_override],
+            mapping[self.config.model.dtype_override],
             int(embedding_bit_width),
             embedding_group_size,
         )
```
