From dbfd3948ea7bad16d58f9170a1d6103b07e2e513 Mon Sep 17 00:00:00 2001
From: Yang Yong
Date: Wed, 8 Jan 2025 17:03:00 +0800
Subject: [PATCH] Add wint4afp8 & fix fp quant bugs (#288)

---
 .../backend/sglang/fp8/awq_fp8.yml            |  3 +-
 .../backend/sglang/fp8/awq_fp8_static.yml     |  3 +-
 .../backend/sglang/fp8/gptq_fp8.yml           |  3 +-
 .../backend/sglang/fp8/rtn_fp8.yml            |  3 +-
 .../backend/sglang/fp8/smoothquant_fp8.yml    |  3 +-
 .../quantization/backend/vllm/fp8/awq_fp8.yml |  3 +-
 .../backend/vllm/fp8/awq_fp8_static.yml       |  3 +-
 .../backend/vllm/fp8/gptq_fp8.yml             |  3 +-
 .../quantization/backend/vllm/fp8/rtn_fp8.yml |  3 +-
 .../backend/vllm/fp8/smoothquant_fp8.yml      |  3 +-
 .../methods/FP_Quant/awq_we2m1a16_g128.yml    |  3 +-
 .../methods/FP_Quant/gptq_we2m1a16_g128.yml   |  3 +-
 .../methods/FP_Quant/rtn_we2m1a16_g128.yml    |  2 +-
 .../methods/FP_Quant/rtn_we2m1ae2m1.yml       |  3 +-
 .../methods/FP_Quant/rtn_we4m3ae4m3.yml       | 27 +++++++------
 .../methods/FP_Quant/rtn_we5m2ae5m2.yml       |  3 +-
 .../methods/RTN/rtn_w_a_wint4afp8.yml         | 40 +++++++++++++++++++
 .../methods/RTN/rtn_w_a_wint4aint8.yml        |  2 --
 llmc/compression/quantization/quant.py        |  2 +-
 19 files changed, 84 insertions(+), 31 deletions(-)
 create mode 100644 configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml

diff --git a/configs/quantization/backend/sglang/fp8/awq_fp8.yml b/configs/quantization/backend/sglang/fp8/awq_fp8.yml
index 8cdc4a63..b2f5396b 100644
--- a/configs/quantization/backend/sglang/fp8/awq_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/awq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml b/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
index 980c1940..6c86e55a 100644
--- a/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
+++ b/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_tensor
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/sglang/fp8/gptq_fp8.yml b/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
index 0b592396..85b4bde2 100644
--- a/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/sglang/fp8/rtn_fp8.yml b/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
index 2d34b706..849973fe 100644
--- a/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
@@ -17,13 +17,14 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml b/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
index 85b823ad..e0caa7ae 100644
--- a/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
@@ -22,14 +22,15 @@ eval:
     seq_len: 2048
 quant:
     method: SmoothQuant
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/awq_fp8.yml b/configs/quantization/backend/vllm/fp8/awq_fp8.yml
index 805c7f45..3a282259 100644
--- a/configs/quantization/backend/vllm/fp8/awq_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/awq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml b/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
index df0cb334..c4542507 100644
--- a/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
+++ b/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_tensor
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/gptq_fp8.yml b/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
index 163f65c4..905be88a 100644
--- a/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float_quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/rtn_fp8.yml b/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
index d982f5d6..f06f492e 100644
--- a/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
@@ -17,13 +17,14 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml b/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
index 1c41dc1c..1c97ce11 100644
--- a/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
@@ -22,14 +22,15 @@ eval:
     seq_len: 2048
 quant:
     method: SmoothQuant
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
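The recurring change above is one of the "fp quant bugs" named in the patch title: quant_type used to sit once at the top of the quant: section (in the vLLM GPTQ config even misspelled as float_quant), but the weight and activation quantizers are configured from their own sub-sections, so a single shared key could not express mixed schemes such as the new wint4afp8 recipe (integer weights, fp8 activations) added later in this patch. Moving quant_type into weight:, act:, and the AWQ/GPTQ special: sections makes the quantization type a per-tensor-kind setting. Below is a minimal sketch of reading the relocated key, assuming PyYAML; tensor_quant_type is a hypothetical helper, not llmc's actual loader, and the int-quant fallback is an inference from the quant_type: int-quant lines deleted from rtn_w_a_wint4aint8.yml further down:

    # Illustrative sketch only: parse a config shaped like the ones in this
    # patch and read the per-tensor quant_type key.
    import yaml

    CFG = """
    quant:
        method: RTN
        weight:
            quant_type: float-quant
            bit: e4m3
        act:
            quant_type: float-quant
            bit: e4m3
    """

    cfg = yaml.safe_load(CFG)

    def tensor_quant_type(section):
        # Assumed default: int-quant when the key is absent, suggested by the
        # deletions in rtn_w_a_wint4aint8.yml below.
        return section.get('quant_type', 'int-quant')

    print(tensor_quant_type(cfg['quant']['weight']))  # float-quant
    print(tensor_quant_type(cfg['quant']['act']))     # float-quant

With the key nested per tensor kind, the new rtn_w_a_wint4afp8.yml config below can leave quant_type unset for weights (the integer path) while setting float-quant only for activations.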
diff --git a/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
index 76d203dd..f79baeba 100644
--- a/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
+++ b/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
@@ -25,14 +25,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: False
         granularity: per_group
         group_size: 128
         use_qtorch: True
     special:
+        quant_type: float-quant
         trans: True
         # The options for "trans_version" include "v1" and "v2".
         trans_version: v2
diff --git a/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
index f18de836..af783509 100644
--- a/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
+++ b/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_group
         group_size: 128
         use_qtorch: True
     special:
+        quant_type: float-quant
         actorder: True
         static_groups: False
         percdamp: 0.01
diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
index 5e2cc61e..c55be361 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
@@ -16,8 +16,8 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_group
diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
index 53169b36..b5ea6fa2 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
@@ -16,12 +16,13 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml b/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
index c203eb19..8493780b 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
@@ -1,27 +1,28 @@
 base:
     seed: &seed 42
 model:
-    type: model_type
-    path: model path
+    type: Llama
+    path: /mnt/nvme1/yongyang/models/llama2-7b
     torch_dtype: auto
-eval:
-    eval_pos: [pretrain, fake_quant]
-    name: wikitext2
-    download: False
-    path: eval data path
-    seq_len: 2048
-    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
-    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
-    bs: 1
-    inference_per_block: False
+# eval:
+#     eval_pos: [pretrain, fake_quant]
+#     name: wikitext2
+#     download: False
+#     path: /mnt/nvme0/yongyang/llm_datasets/llmc/eval/wikitext2
+#     seq_len: 2048
+#     # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+#     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+#     bs: 1
+#     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml b/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
index d90675cb..ed3ea2f4 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
@@ -16,12 +16,13 @@ eval:
    inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e5m2
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e5m2
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml b/configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml
new file mode 100644
index 00000000..4156850c
--- /dev/null
+++ b/configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml
@@ -0,0 +1,40 @@
+base:
+    seed: &seed 42
+model:
+    type: Llama
+    path: /mnt/nvme1/yongyang/models/llama2-7b
+    torch_dtype: auto
+eval:
+    eval_pos: [pretrain, fake_quant]
+    name: wikitext2
+    download: False
+    path: /mnt/nvme0/yongyang/llm_datasets/llmc/eval/wikitext2
+    seq_len: 2048
+    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+    bs: 1
+    inference_per_block: False
+quant:
+    method: RTN
+    weight:
+        bit: 48
+        bit4:
+            symmetric: False
+            granularity: per_group
+            group_size: 128
+            scales_bit: 8
+            scales_symmetric: True
+            zeros_bit: 8
+            zeros_symmetric: True
+        bit8:
+            symmetric: True
+            granularity: per_channel
+            int_range: [-120, 120]
+    act:
+        quant_type: float-quant
+        bit: e4m3
+        symmetric: True
+        granularity: per_token
+save:
+    save_fake: False
+    save_path: /path/to/save/
diff --git a/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml b/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml
index 0e105a70..237c2118 100644
--- a/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml
+++ b/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml
@@ -17,7 +17,6 @@ eval:
 quant:
     method: RTN
     weight:
-        quant_type: int-quant
         bit: 48
         bit4:
             symmetric: False
@@ -32,7 +31,6 @@ quant:
             granularity: per_channel
             int_range: [-120, 120]
     act:
-        quant_type: int-quant
         bit: 8
         symmetric: True
         granularity: per_token
diff --git a/llmc/compression/quantization/quant.py b/llmc/compression/quantization/quant.py
index c0c8de1f..3d2192fe 100644
--- a/llmc/compression/quantization/quant.py
+++ b/llmc/compression/quantization/quant.py
@@ -37,7 +37,7 @@ def __init__(self, bit, symmetric, granularity, **kwargs):
         # hist config
         self.bins = self.kwargs.get('bins', 2048)
         self.hist_threshold = self.kwargs.get('hist_threshold', 1)
-        self.dst_nbins = 2**bit
+        self.dst_nbins = 2**bit if isinstance(bit, int) else None
         self.upsample_rate = (
             16  # used to reduce quantization errors when upscaling histogram
         )
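The final hunk is the other fp quant bug: the quantizer's __init__ computed self.dst_nbins = 2**bit unconditionally, but under the float-quant configs above, bit is a format string such as 'e4m3' rather than an integer width, so constructing any float quantizer raised a TypeError before the histogram settings were ever used. The guard keeps the 2**bit bin count for integer widths and stores None otherwise. A standalone sketch of the guarded expression follows; the function wrapper is only for illustration, since in the patch the expression is assigned inline:

    def dst_nbins(bit):
        # Histogram calibration needs 2**bit destination bins only when
        # `bit` is an integer width; float formats like 'e4m3' carry no
        # meaningful bin count, so the patch stores None instead.
        return 2**bit if isinstance(bit, int) else None

    assert dst_nbins(8) == 256         # int-quant: 256 histogram bins
    assert dst_nbins('e4m3') is None   # float-quant: not applicable
    # Before the fix, 2**'e4m3' raised:
    # TypeError: unsupported operand type(s) for ** or pow(): 'int' and 'str'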