Skip to content

Commit

Permalink
update configs
Browse files (browse the repository at this point in the history)
  • Loading branch information
helloyongyang committed Dec 13, 2024
1 parent 691a82f commit 7fa25ad
Show file tree
Hide file tree
Showing 47 changed files with 0 additions and 83 deletions.
1 change: 0 additions & 1 deletion configs/quantization/backend/autoawq/rtn_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_autoawq: True
save_path: /path/to/save_for_autoawq_rtn_w4/
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/gptq_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
Expand Down
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/rtn_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_mlcllm: True
save_path: /path/to/save_for_mlcllm_rtn_w4/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/gptq_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
Expand Down
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a8.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn_w8a8/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/gptq_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
Expand Down
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w4a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a16.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a8.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn_w8a8/
2 changes: 0 additions & 2 deletions configs/quantization/methods/AdaDim/adadim_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: AdaDim
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_chat_data.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_mix_bits.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
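2 changes: 0 additions & 2 deletions (file path not captured in this extract; a different file from the preceding hunk)
Original file line number Diff line number Diff line change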
Expand Up @@ -26,8 +26,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
Expand Down
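2 changes: 0 additions & 2 deletions (file path not captured in this extract; a different file from the preceding hunk)
Original file line number Diff line number Diff line change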
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
Expand Down
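2 changes: 0 additions & 2 deletions (file path not captured in this extract; a different file from the preceding hunk)
Original file line number Diff line number Diff line change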
Expand Up @@ -26,8 +26,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only_mix_bits_1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only_mix_bits_2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only_opencompass.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/DGQ/dgq_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: DGQ
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
quant_type: float-quant
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: GPTQ
quant_type: float-quant
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
quant_type: float-quant
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
quant_type: float-quant
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
quant_type: float-quant
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
quant_type: float-quant
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/GPTQ/gptq_owq_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: GPTQ
weight:
Expand Down
1 change: 0 additions & 1 deletion configs/quantization/methods/GPTQ/gptq_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/HQQ/hqq_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: HQQ
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/LlmInt8/llmint8_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: LlmInt8
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/NormTweaking/ntweak_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: NormTweaking
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/NormTweaking/ntweak_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: NormTweaking
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/OmniQuant/omniq_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: OmniQuant
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/OmniQuant/omniq_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: OmniQuant
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/OsPlus/osplus_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: OsPlus
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/QUIK/quik_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: QUIK
weight:
Expand Down
3 changes: 0 additions & 3 deletions configs/quantization/methods/QuaRot/quarot_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
# Online rotation does not support evaluation with consistent tokens.
eval_token_consist: False
quant:
method: Quarot
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/RTN/rtn_w_a.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/RTN/rtn_w_a_kv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/RTN/rtn_w_a_pertensor_static.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
weight:
Expand Down
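2 changes: 0 additions & 2 deletions (file path not captured in this extract; a different file from the preceding hunk)
Original file line number Diff line number Diff line change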
Expand Up @@ -23,8 +23,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
weight:
Expand Down
2 changes: 0 additions & 2 deletions configs/quantization/methods/RTN/rtn_w_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: RTN
weight:
Expand Down
Loading

0 comments on commit 7fa25ad

Please sign in to comment.