update data format in llmc #259

Merged
merged 1 commit on Dec 13, 2024
1 change: 0 additions & 1 deletion configs/quantization/backend/autoawq/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_autoawq: True
save_path: /path/to/save_for_autoawq_rtn_w4/
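Note: this PR removes the eval_token_consist flag from every backend and method config. Token-consistency evaluation appears to move into the new list-style eval section as a dedicated entry with type: token_acc (see awq_w_only_custom_data.yml further down). A minimal sketch of the replacement, with a placeholder dataset path:

eval:
  - eval_pos: [pretrain, transformed, fake_quant]
    name: wikitext2
    type: token_acc  # replaces the removed eval_token_consist flag
    download: False
    path: /path/to/wikitext2
    seq_len: 2048
    bs: 1
    inference_per_block: False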
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/gptq_w4a16.yml
@@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_mlcllm: True
save_path: /path/to/save_for_mlcllm_rtn_w4/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn_w8a8/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn_w8a8/
2 changes: 0 additions & 2 deletions configs/quantization/methods/AdaDim/adadim_w_a.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: AdaDim
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_chat_data.yml
@@ -25,8 +25,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_mix_bits.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
@@ -6,15 +6,14 @@ model:
tokenizer_mode: slow
torch_dtype: auto
calib:
name: alm_datastes
type: audio_txt
name: custom_mm
download: False
path: calib data path
add_answer: False
n_samples: 128
apply_chat_template: True
add_answer: True # Default is False. If set to True, calibration data will include answers.
n_samples: 8
bs: -1
seq_len: 512
preproc: alm_general
padding: True
seed: *seed
eval:
@@ -27,8 +26,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
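For readability, here is the calib section this audio config likely ends up with after the change (a reconstruction from the interleaved diff above; "calib data path" is the placeholder used in the diff itself):

calib:
  name: custom_mm
  download: False
  path: calib data path
  apply_chat_template: True
  add_answer: True  # default is False; when True, calibration samples include answers
  n_samples: 8
  bs: -1
  seq_len: 512
  padding: True
  seed: *seed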
@@ -7,15 +7,14 @@ model:
torch_dtype: auto
use_cpu_to_save_cuda_mem_for_catcher: False
calib:
name: avlm_datastes
type: audio_img_txt
name: custom_mm
download: False
path: calib data path
add_answer: False
n_samples: 128
apply_chat_template: True
add_answer: True # Default is False. If set to True, calibration data will include answers.
n_samples: 8
bs: -1
seq_len: 512
preproc: avlm_general
padding: True
seed: *seed
eval:
@@ -28,8 +27,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
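The audio-visual config above follows the same migration as the audio config: avlm_datastes with its avlm_general preproc gives way to the unified custom_mm loader, so its resulting calib section matches the sketch above.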
47 changes: 32 additions & 15 deletions configs/quantization/methods/Awq/awq_w_only_custom_data.yml
@@ -6,27 +6,44 @@ model:
tokenizer_mode: slow
torch_dtype: auto
calib:
name: custom
name: custom_txt
download: False
load_from_txt: True
path: ./inputs.txt
path: calib data path
apply_chat_template: True
n_samples: 128
bs: -1
seq_len: 512
bs: -1
preproc: random_truncate_txt
seed: *seed
eval:
eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
download: False
path: eval data path
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
- eval_pos: [pretrain, transformed, fake_quant]
name: custom_gen
type: generate_only
max_new_tokens: 32
bs: 1
download: False
path: /data/yongyang/datasets/general_custom_data
apply_chat_template: True
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
type: token_acc
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
quant:
method: Awq
weight:
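Note: the single wikitext2 eval block in this config is replaced by a list of eval entries, each with its own eval_pos. Reading the names against the comments removed elsewhere in this PR: type: generate_only runs free generation on the custom prompts (up to max_new_tokens tokens per sample), type: token_acc measures the consistency of tokens between the original and fake-quantized model outputs (the job of the removed eval_token_consist flag), and the entry with no type presumably keeps the default perplexity evaluation on wikitext2.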
110 changes: 110 additions & 0 deletions configs/quantization/methods/Awq/awq_w_only_custom_data_debug.yml
@@ -0,0 +1,110 @@
base:
seed: &seed 42
model:
# type: Qwen2
# path: /data/yongyang/models/qwen25/Qwen2.5-0.5B-Instruct
type: Qwen2VL
path: /data/yongyang/models/Qwen2-VL-2B-Instruct
# type: Llava
# path: /data/yongyang/models/llava-1.5-7b-hf
# type: InternVL2
# path: /data/yongyang/models/InternVL2-2B
# type: Qwen2Audio
# path: /data/yongyang/models/Qwen2-Audio-7B-Instruct
# type: InternOmni
# path: /data/yongyang/models/InternOmni
# type: Llama
# path: /data/yongyang/models/Meta-Llama-3.1-8B-Instruct
# type: InternLM2
# path: /data/yongyang/models/internlm2-chat-1_8b
# type: DeepseekV2
# path: /data/yongyang/models/DeepSeek-V2-Lite-Chat
tokenizer_mode: fast
torch_dtype: auto
# calib:
# name: pileval
# download: False
# path: /data/yongyang/datasets/llmc/calib/pileval
# n_samples: 2
# bs: -1
# seq_len: 512
# preproc: txt_general_preproc
# seed: *seed
# calib:
# name: custom_txt
# download: False
# path: /data/yongyang/datasets/general_custom_data
# apply_chat_template: True
# n_samples: 8
# bs: -1
# padding: True
# seed: *seed
# calib:
# name: custom_txt
# download: False
# path: /data/yongyang/datasets/general_custom_data
# apply_chat_template: True
# n_samples: 8
# seq_len: 3
# bs: -1
# preproc: random_truncate_txt
# seed: *seed
calib:
name: custom_mm
download: False
path: /data/yongyang/datasets/general_custom_data
apply_chat_template: True
add_answer: True # Default is False. If set to True, calibration data will include answers.
n_samples: 8
bs: -1
seq_len: 512
padding: True
seed: *seed
eval:
- eval_pos: [pretrain, transformed, fake_quant]
name: custom_gen
type: generate_only
max_new_tokens: 32
bs: 1
download: False
path: /data/yongyang/datasets/general_custom_data
apply_chat_template: True
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
type: token_acc
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
quant:
method: Awq
weight:
bit: 4
symmetric: False
granularity: per_group
group_size: 128
special:
trans: True
# The options for "trans_version" include "v1" and "v2".
# But their results don't differ significantly.
trans_version: v2
weight_clip: False
# For 2-bit quantization, setting "clip_sym: False" will yield better results.
clip_sym: True
save:
save_trans: False
save_fake: False
save_path: /path/to/save/
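This new debug config keeps alternative setups side by side as comments: one model type per supported family (Qwen2, Qwen2VL, Llava, InternVL2, Qwen2Audio, InternOmni, Llama, InternLM2, DeepseekV2) and three calib variants (pileval with txt_general_preproc, custom_txt with padding, and custom_txt with random_truncate_txt), presumably so each path through the new data format can be exercised by uncommenting one block at a time.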
@@ -6,13 +6,12 @@ model:
tokenizer_mode: slow
torch_dtype: auto
calib:
name: custom
name: custom_txt
download: False
load_from_txt: True
path: ./inputs.txt
n_samples: 128
path: calib data path
apply_chat_template: True
n_samples: 8
bs: -1
preproc: original_txt
padding: True
seed: *seed
eval:
@@ -25,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
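As above, a reconstruction of the calib section this config likely ends up with (note this custom_txt variant relies on padding: True rather than a preproc):

calib:
  name: custom_txt
  download: False
  path: calib data path
  apply_chat_template: True
  n_samples: 8
  bs: -1
  padding: True
  seed: *seed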