update configs

ModelTC · Dec 13, 2024 · 7fa25ad · 7fa25ad
1 parent 691a82f
commit 7fa25ad
Show file tree

Hide file tree

Showing 47 changed files with 0 additions and 83 deletions.
diff --git a/configs/quantization/backend/autoawq/rtn_w4a16.yml b/configs/quantization/backend/autoawq/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_autoawq: True
     save_path: /path/to/save_for_autoawq_rtn_w4/
diff --git a/configs/quantization/backend/mlcllm/gptq_w4a16.yml b/configs/quantization/backend/mlcllm/gptq_w4a16.yml
@@ -22,7 +22,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:

diff --git a/configs/quantization/backend/mlcllm/rtn_w4a16.yml b/configs/quantization/backend/mlcllm/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_mlcllm: True
     save_path: /path/to/save_for_mlcllm_rtn_w4/
diff --git a/configs/quantization/backend/sglang/gptq_w4a16.yml b/configs/quantization/backend/sglang/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:

diff --git a/configs/quantization/backend/sglang/rtn_w4a16.yml b/configs/quantization/backend/sglang/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_sgl: True
     save_path: /path/to/save_for_sgl_rtn/
diff --git a/configs/quantization/backend/sglang/rtn_w8a16.yml b/configs/quantization/backend/sglang/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_sgl: True
     save_path: /path/to/save_for_sgl_rtn_w8a16/
diff --git a/configs/quantization/backend/sglang/rtn_w8a8.yml b/configs/quantization/backend/sglang/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_sgl: True
     save_path: /path/to/save_for_sgl_rtn_w8a8/
diff --git a/configs/quantization/backend/vllm/gptq_w4a16.yml b/configs/quantization/backend/vllm/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:

diff --git a/configs/quantization/backend/vllm/rtn_w4a16.yml b/configs/quantization/backend/vllm/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_vllm: True
     save_path: /path/to/save_for_vllm_rtn/
diff --git a/configs/quantization/backend/vllm/rtn_w8a16.yml b/configs/quantization/backend/vllm/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_vllm: True
     save_path: /path/to/save_for_vllm_rtn_w8a16/
diff --git a/configs/quantization/backend/vllm/rtn_w8a8.yml b/configs/quantization/backend/vllm/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_vllm: True
     save_path: /path/to/save_for_vllm_rtn_w8a8/
diff --git a/configs/quantization/methods/AdaDim/adadim_w_a.yml b/configs/quantization/methods/AdaDim/adadim_w_a.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: AdaDim
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_a.yml b/configs/quantization/methods/Awq/awq_w_a.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_a_chat_data.yml b/configs/quantization/methods/Awq/awq_w_a_chat_data.yml
@@ -25,8 +25,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_a_mix_bits.yml b/configs/quantization/methods/Awq/awq_w_a_mix_bits.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only.yml b/configs/quantization/methods/Awq/awq_w_only.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only_custom_alm_data_padding.yml b/configs/quantization/methods/Awq/awq_w_only_custom_alm_data_padding.yml
@@ -26,8 +26,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: False
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only_custom_data_padding.yml b/configs/quantization/methods/Awq/awq_w_only_custom_data_padding.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: False
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only_custom_vlm_data_padding.yml b/configs/quantization/methods/Awq/awq_w_only_custom_vlm_data_padding.yml
@@ -26,8 +26,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: False
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only_mix_bits_1.yml b/configs/quantization/methods/Awq/awq_w_only_mix_bits_1.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only_mix_bits_2.yml b/configs/quantization/methods/Awq/awq_w_only_mix_bits_2.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/Awq/awq_w_only_opencompass.yml b/configs/quantization/methods/Awq/awq_w_only_opencompass.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:

diff --git a/configs/quantization/methods/DGQ/dgq_w_a.yml b/configs/quantization/methods/DGQ/dgq_w_a.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: DGQ
     weight:

diff --git a/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     quant_type: float-quant

diff --git a/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: GPTQ
     quant_type: float-quant

diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     quant_type: float-quant

diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     quant_type: float-quant

diff --git a/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml b/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     quant_type: float-quant

diff --git a/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml b/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     quant_type: float-quant

diff --git a/configs/quantization/methods/GPTQ/gptq_owq_w_only.yml b/configs/quantization/methods/GPTQ/gptq_owq_w_only.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:

diff --git a/configs/quantization/methods/GPTQ/gptq_w_only.yml b/configs/quantization/methods/GPTQ/gptq_w_only.yml
@@ -21,7 +21,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:

diff --git a/configs/quantization/methods/HQQ/hqq_w_only.yml b/configs/quantization/methods/HQQ/hqq_w_only.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: HQQ
     weight:

diff --git a/configs/quantization/methods/LlmInt8/llmint8_w_only.yml b/configs/quantization/methods/LlmInt8/llmint8_w_only.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: LlmInt8
     weight:

diff --git a/configs/quantization/methods/NormTweaking/ntweak_w_a.yml b/configs/quantization/methods/NormTweaking/ntweak_w_a.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: NormTweaking
     weight:

diff --git a/configs/quantization/methods/NormTweaking/ntweak_w_only.yml b/configs/quantization/methods/NormTweaking/ntweak_w_only.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: NormTweaking
     weight:

diff --git a/configs/quantization/methods/OmniQuant/omniq_w_a.yml b/configs/quantization/methods/OmniQuant/omniq_w_a.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: OmniQuant
     weight:

diff --git a/configs/quantization/methods/OmniQuant/omniq_w_only.yml b/configs/quantization/methods/OmniQuant/omniq_w_only.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: OmniQuant
     weight:

diff --git a/configs/quantization/methods/OsPlus/osplus_w_a.yml b/configs/quantization/methods/OsPlus/osplus_w_a.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: OsPlus
     weight:

diff --git a/configs/quantization/methods/QUIK/quik_w_a.yml b/configs/quantization/methods/QUIK/quik_w_a.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: QUIK
     weight:

diff --git a/configs/quantization/methods/QuaRot/quarot_w_a.yml b/configs/quantization/methods/QuaRot/quarot_w_a.yml
@@ -14,9 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    # Online rotation does not support evaluation with consistent tokens.
-    eval_token_consist: False
 quant:
     method: Quarot
     weight:

diff --git a/configs/quantization/methods/RTN/rtn_w_a.yml b/configs/quantization/methods/RTN/rtn_w_a.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     weight:

diff --git a/configs/quantization/methods/RTN/rtn_w_a_kv.yml b/configs/quantization/methods/RTN/rtn_w_a_kv.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     weight:

diff --git a/configs/quantization/methods/RTN/rtn_w_a_pertensor_static.yml b/configs/quantization/methods/RTN/rtn_w_a_pertensor_static.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     weight:

diff --git a/configs/quantization/methods/RTN/rtn_w_a_pertensor_static_kv.yml b/configs/quantization/methods/RTN/rtn_w_a_pertensor_static_kv.yml
@@ -23,8 +23,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     weight:

diff --git a/configs/quantization/methods/RTN/rtn_w_only.yml b/configs/quantization/methods/RTN/rtn_w_only.yml
@@ -14,8 +14,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: RTN
     weight: