From dbfd3948ea7bad16d58f9170a1d6103b07e2e513 Mon Sep 17 00:00:00 2001
From: Yang Yong
Date: Wed, 8 Jan 2025 17:03:00 +0800
Subject: [PATCH] Add wint4afp8 & fix fp quant bugs (#288)

---
 .../backend/sglang/fp8/awq_fp8.yml            |  3 +-
 .../backend/sglang/fp8/awq_fp8_static.yml     |  3 +-
 .../backend/sglang/fp8/gptq_fp8.yml           |  3 +-
 .../backend/sglang/fp8/rtn_fp8.yml            |  3 +-
 .../backend/sglang/fp8/smoothquant_fp8.yml    |  3 +-
 .../quantization/backend/vllm/fp8/awq_fp8.yml |  3 +-
 .../backend/vllm/fp8/awq_fp8_static.yml       |  3 +-
 .../backend/vllm/fp8/gptq_fp8.yml             |  3 +-
 .../quantization/backend/vllm/fp8/rtn_fp8.yml |  3 +-
 .../backend/vllm/fp8/smoothquant_fp8.yml      |  3 +-
 .../methods/FP_Quant/awq_we2m1a16_g128.yml    |  3 +-
 .../methods/FP_Quant/gptq_we2m1a16_g128.yml   |  3 +-
 .../methods/FP_Quant/rtn_we2m1a16_g128.yml    |  2 +-
 .../methods/FP_Quant/rtn_we2m1ae2m1.yml       |  3 +-
 .../methods/FP_Quant/rtn_we4m3ae4m3.yml       | 27 +++++++------
 .../methods/FP_Quant/rtn_we5m2ae5m2.yml       |  3 +-
 .../methods/RTN/rtn_w_a_wint4afp8.yml         | 40 +++++++++++++++++++
 .../methods/RTN/rtn_w_a_wint4aint8.yml        |  2 --
 llmc/compression/quantization/quant.py        |  2 +-
 19 files changed, 84 insertions(+), 31 deletions(-)
 create mode 100644 configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml

diff --git a/configs/quantization/backend/sglang/fp8/awq_fp8.yml b/configs/quantization/backend/sglang/fp8/awq_fp8.yml
index 8cdc4a63..b2f5396b 100644
--- a/configs/quantization/backend/sglang/fp8/awq_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/awq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml b/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
index 980c1940..6c86e55a 100644
--- a/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
+++ b/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_tensor
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/sglang/fp8/gptq_fp8.yml b/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
index 0b592396..85b4bde2 100644
--- a/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/sglang/fp8/rtn_fp8.yml b/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
index 2d34b706..849973fe 100644
--- a/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
@@ -17,13 +17,14 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml b/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
index 85b823ad..e0caa7ae 100644
--- a/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
+++ b/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
@@ -22,14 +22,15 @@ eval:
     seq_len: 2048
 quant:
     method: SmoothQuant
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/awq_fp8.yml b/configs/quantization/backend/vllm/fp8/awq_fp8.yml
index 805c7f45..3a282259 100644
--- a/configs/quantization/backend/vllm/fp8/awq_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/awq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml b/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
index df0cb334..c4542507 100644
--- a/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
+++ b/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_tensor
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/gptq_fp8.yml b/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
index 163f65c4..905be88a 100644
--- a/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float_quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
diff --git a/configs/quantization/backend/vllm/fp8/rtn_fp8.yml b/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
index d982f5d6..f06f492e 100644
--- a/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
@@ -17,13 +17,14 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml b/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
index 1c41dc1c..1c97ce11 100644
--- a/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
+++ b/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
@@ -22,14 +22,15 @@ eval:
     seq_len: 2048
 quant:
     method: SmoothQuant
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
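The recurring change above is one of the "fp quant bugs" named in the patch title: quant_type used to sit once at the top of the quant: section (in the vLLM GPTQ config even misspelled as float_quant), but the weight and activation quantizers are configured from their own sub-sections, so a single shared key could not express mixed schemes such as the new wint4afp8 recipe (integer weights, fp8 activations) added later in this patch. Moving quant_type into weight:, act:, and the AWQ/GPTQ special: sections makes the quantization type a per-tensor-kind setting. Below is a minimal sketch of reading the relocated key, assuming PyYAML; tensor_quant_type is a hypothetical helper, not llmc's actual loader, and the int-quant fallback is an inference from the quant_type: int-quant lines deleted from rtn_w_a_wint4aint8.yml further down:

    # Illustrative sketch only: parse a config shaped like the ones in this
    # patch and read the per-tensor quant_type key.
    import yaml

    CFG = """
    quant:
        method: RTN
        weight:
            quant_type: float-quant
            bit: e4m3
        act:
            quant_type: float-quant
            bit: e4m3
    """

    cfg = yaml.safe_load(CFG)

    def tensor_quant_type(section):
        # Assumed default: int-quant when the key is absent, suggested by the
        # deletions in rtn_w_a_wint4aint8.yml below.
        return section.get('quant_type', 'int-quant')

    print(tensor_quant_type(cfg['quant']['weight']))  # float-quant
    print(tensor_quant_type(cfg['quant']['act']))     # float-quant

With the key nested per tensor kind, the new rtn_w_a_wint4afp8.yml config below can leave quant_type unset for weights (the integer path) while setting float-quant only for activations.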
diff --git a/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
index 76d203dd..f79baeba 100644
--- a/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
+++ b/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
@@ -25,14 +25,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: False
         granularity: per_group
         group_size: 128
         use_qtorch: True
     special:
+        quant_type: float-quant
         trans: True
         # The options for "trans_version" include "v1" and "v2".
         trans_version: v2
diff --git a/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
index f18de836..af783509 100644
--- a/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
+++ b/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_group
         group_size: 128
         use_qtorch: True
     special:
+        quant_type: float-quant
         actorder: True
         static_groups: False
         percdamp: 0.01
diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
index 5e2cc61e..c55be361 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
@@ -16,8 +16,8 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_group
diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
index 53169b36..b5ea6fa2 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
@@ -16,12 +16,13 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml b/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
index c203eb19..8493780b 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
@@ -1,27 +1,28 @@
 base:
     seed: &seed 42
 model:
-    type: model_type
-    path: model path
+    type: Llama
+    path: /mnt/nvme1/yongyang/models/llama2-7b
     torch_dtype: auto
-eval:
-    eval_pos: [pretrain, fake_quant]
-    name: wikitext2
-    download: False
-    path: eval data path
-    seq_len: 2048
-    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
-    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
-    bs: 1
-    inference_per_block: False
+# eval:
+#     eval_pos: [pretrain, fake_quant]
+#     name: wikitext2
+#     download: False
+#     path: /mnt/nvme0/yongyang/llm_datasets/llmc/eval/wikitext2
+#     seq_len: 2048
+#     # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+#     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+#     bs: 1
+#     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml b/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
index d90675cb..ed3ea2f4 100644
--- a/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
+++ b/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
@@ -16,12 +16,13 @@ eval:
    inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e5m2
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e5m2
         symmetric: True
         granularity: per_token
diff --git a/configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml b/configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml
new file mode 100644
index 00000000..4156850c
--- /dev/null
+++ b/configs/quantization/methods/RTN/rtn_w_a_wint4afp8.yml
@@ -0,0 +1,40 @@
+base:
+    seed: &seed 42
+model:
+    type: Llama
+    path: /mnt/nvme1/yongyang/models/llama2-7b
+    torch_dtype: auto
+eval:
+    eval_pos: [pretrain, fake_quant]
+    name: wikitext2
+    download: False
+    path: /mnt/nvme0/yongyang/llm_datasets/llmc/eval/wikitext2
+    seq_len: 2048
+    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+    bs: 1
+    inference_per_block: False
+quant:
+    method: RTN
+    weight:
+        bit: 48
+        bit4:
+            symmetric: False
+            granularity: per_group
+            group_size: 128
+            scales_bit: 8
+            scales_symmetric: True
+            zeros_bit: 8
+            zeros_symmetric: True
+        bit8:
+            symmetric: True
+            granularity: per_channel
+            int_range: [-120, 120]
+    act:
+        quant_type: float-quant
+        bit: e4m3
+        symmetric: True
+        granularity: per_token
+save:
+    save_fake: False
+    save_path: /path/to/save/
diff --git a/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml b/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml
index 0e105a70..237c2118 100644
--- a/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml
+++ b/configs/quantization/methods/RTN/rtn_w_a_wint4aint8.yml
@@ -17,7 +17,6 @@ eval:
 quant:
     method: RTN
     weight:
-        quant_type: int-quant
         bit: 48
         bit4:
             symmetric: False
@@ -32,7 +31,6 @@ quant:
             granularity: per_channel
             int_range: [-120, 120]
     act:
-        quant_type: int-quant
         bit: 8
         symmetric: True
         granularity: per_token
diff --git a/llmc/compression/quantization/quant.py b/llmc/compression/quantization/quant.py
index c0c8de1f..3d2192fe 100644
--- a/llmc/compression/quantization/quant.py
+++ b/llmc/compression/quantization/quant.py
@@ -37,7 +37,7 @@ def __init__(self, bit, symmetric, granularity, **kwargs):
         # hist config
         self.bins = self.kwargs.get('bins', 2048)
         self.hist_threshold = self.kwargs.get('hist_threshold', 1)
-        self.dst_nbins = 2**bit
+        self.dst_nbins = 2**bit if isinstance(bit, int) else None
         self.upsample_rate = (
             16  # used to reduce quantization errors when upscaling histogram
         )
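The final hunk is the other fp quant bug: the quantizer's __init__ computed self.dst_nbins = 2**bit unconditionally, but under the float-quant configs above, bit is a format string such as 'e4m3' rather than an integer width, so constructing any float quantizer raised a TypeError before the histogram settings were ever used. The guard keeps the 2**bit bin count for integer widths and stores None otherwise. A standalone sketch of the guarded expression follows; the function wrapper is only for illustration, since in the patch the expression is assigned inline:

    def dst_nbins(bit):
        # Histogram calibration needs 2**bit destination bins only when
        # `bit` is an integer width; float formats like 'e4m3' carry no
        # meaningful bin count, so the patch stores None instead.
        return 2**bit if isinstance(bit, int) else None

    assert dst_nbins(8) == 256         # int-quant: 256 histogram bins
    assert dst_nbins('e4m3') is None   # float-quant: not applicable
    # Before the fix, 2**'e4m3' raised:
    # TypeError: unsupported operand type(s) for ** or pow(): 'int' and 'str'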