Big Refactor for data format in llmc
helloyongyang committed Dec 13, 2024
1 parent 5e3361c commit dc3b11b
Showing 76 changed files with 768 additions and 648 deletions.
1 change: 0 additions & 1 deletion configs/quantization/backend/autoawq/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_autoawq: True
     save_path: /path/to/save_for_autoawq_rtn_w4/
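
The same one-line deletion repeats across the backend and method configs below: the standalone eval_token_consist flag is removed everywhere. Judging from the hunks later in this commit (see awq_w_only_custom_data.yml), token consistency now appears to be requested as a dedicated entry in the new list-style eval section rather than as a flag. A minimal sketch, assuming that format; "eval data path" is the configs' own placeholder:

eval:
    - eval_pos: [pretrain, transformed, fake_quant]
      name: wikitext2
      type: token_acc   # token-level agreement with the original model; replaces eval_token_consist
      download: False
      path: eval data path
      seq_len: 2048
      bs: 1
      inference_per_block: False
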
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/gptq_w4a16.yml
@@ -22,7 +22,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_mlcllm: True
     save_path: /path/to/save_for_mlcllm_rtn_w4/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_sgl: True
     save_path: /path/to/save_for_sgl_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_sgl: True
     save_path: /path/to/save_for_sgl_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_sgl: True
     save_path: /path/to/save_for_sgl_rtn_w8a8/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 quant:
     method: GPTQ
     weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_vllm: True
     save_path: /path/to/save_for_vllm_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_vllm: True
     save_path: /path/to/save_for_vllm_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
     bs: 1
     seq_len: 2048
     inference_per_block: False
-    eval_token_consist: True
 save:
     save_vllm: True
     save_path: /path/to/save_for_vllm_rtn_w8a8/
2 changes: 0 additions & 2 deletions configs/quantization/methods/AdaDim/adadim_w_a.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: AdaDim
     weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_chat_data.yml
@@ -25,8 +25,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_mix_bits.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only.yml
@@ -24,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
 quant:
     method: Awq
     weight:
@@ -6,15 +6,14 @@ model:
     tokenizer_mode: slow
     torch_dtype: auto
 calib:
-    name: alm_datastes
-    type: audio_txt
+    name: custom_mm
     download: False
     path: calib data path
-    add_answer: False
-    n_samples: 128
     apply_chat_template: True
+    add_answer: True # Default is False. If set to True, calib data will include answers.
+    n_samples: 8
     bs: -1
     seq_len: 512
-    preproc: alm_general
+    padding: True
     seed: *seed
 eval:
@@ -27,8 +26,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: False
 quant:
     method: Awq
     weight:
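
For readability, the refactored multimodal calib block implied by the hunk above would read approximately as follows. This is a reconstruction from the added lines, not a literal file in the commit; "calib data path" is the authors' placeholder:

calib:
    name: custom_mm
    download: False
    path: calib data path
    apply_chat_template: True
    add_answer: True # Default is False. If set to True, calib data will include answers.
    n_samples: 8
    bs: -1
    seq_len: 512
    padding: True
    seed: *seed
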
@@ -7,15 +7,14 @@ model:
     torch_dtype: auto
     use_cpu_to_save_cuda_mem_for_catcher: False
 calib:
-    name: avlm_datastes
-    type: audio_img_txt
+    name: custom_mm
     download: False
     path: calib data path
-    add_answer: False
-    n_samples: 128
     apply_chat_template: True
+    add_answer: True # Default is False. If set to True, calib data will include answers.
+    n_samples: 8
     bs: -1
     seq_len: 512
-    preproc: avlm_general
+    padding: True
     seed: *seed
 eval:
@@ -28,8 +27,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: False
 quant:
     method: Awq
     weight:
47 changes: 32 additions & 15 deletions configs/quantization/methods/Awq/awq_w_only_custom_data.yml
@@ -6,27 +6,44 @@ model:
     tokenizer_mode: slow
     torch_dtype: auto
 calib:
-    name: custom
+    name: custom_txt
     download: False
-    load_from_txt: True
-    path: ./inputs.txt
+    path: calib data path
+    apply_chat_template: True
     n_samples: 128
-    bs: -1
     seq_len: 512
+    bs: -1
+    preproc: random_truncate_txt
     seed: *seed
 eval:
-    eval_pos: [pretrain, transformed, fake_quant]
-    name: wikitext2
-    download: False
-    path: eval data path
-    seq_len: 2048
-    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
-    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
-    bs: 1
-    inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: True
+    - eval_pos: [pretrain, transformed, fake_quant]
+      name: custom_gen
+      type: generate_only
+      max_new_tokens: 32
+      bs: 1
+      download: False
+      path: /data/yongyang/datasets/general_custom_data
+      apply_chat_template: True
+      inference_per_block: False
+    - eval_pos: [pretrain, transformed, fake_quant]
+      name: wikitext2
+      type: token_acc
+      download: False
+      path: /data/yongyang/datasets/llmc/eval/wikitext2
+      seq_len: 2048
+      # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+      # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+      bs: 1
+      inference_per_block: False
+    - eval_pos: [pretrain, transformed, fake_quant]
+      name: wikitext2
+      download: False
+      path: /data/yongyang/datasets/llmc/eval/wikitext2
+      seq_len: 2048
+      # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+      # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+      bs: 1
+      inference_per_block: False
 quant:
     method: Awq
     weight:
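
After this refactor, eval takes a list of entries instead of a single dict, and each entry selects its mode via type: generate_only samples generations for side-by-side inspection, token_acc measures token-level agreement with the original model, and entries without a type appear to keep the original perplexity behavior. A minimal two-entry sketch, inferred from the hunk above ("eval data path" is a placeholder):

eval:
    - eval_pos: [pretrain, fake_quant]
      name: custom_gen
      type: generate_only   # sample generations from the (fake-)quantized model
      max_new_tokens: 32
      bs: 1
      download: False
      path: eval data path
      apply_chat_template: True
      inference_per_block: False
    - eval_pos: [pretrain, fake_quant]
      name: wikitext2       # no "type" key: default perplexity evaluation
      download: False
      path: eval data path
      seq_len: 2048
      bs: 1
      inference_per_block: False
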
110 changes: 110 additions & 0 deletions configs/quantization/methods/Awq/awq_w_only_custom_data_debug.yml
@@ -0,0 +1,110 @@
base:
    seed: &seed 42
model:
    # type: Qwen2
    # path: /data/yongyang/models/qwen25/Qwen2.5-0.5B-Instruct
    type: Qwen2VL
    path: /data/yongyang/models/Qwen2-VL-2B-Instruct
    # type: Llava
    # path: /data/yongyang/models/llava-1.5-7b-hf
    # type: InternVL2
    # path: /data/yongyang/models/InternVL2-2B
    # type: Qwen2Audio
    # path: /data/yongyang/models/Qwen2-Audio-7B-Instruct
    # type: InternOmni
    # path: /data/yongyang/models/InternOmni
    # type: Llama
    # path: /data/yongyang/models/Meta-Llama-3.1-8B-Instruct
    # type: InternLM2
    # path: /data/yongyang/models/internlm2-chat-1_8b
    # type: DeepseekV2
    # path: /data/yongyang/models/DeepSeek-V2-Lite-Chat
    tokenizer_mode: fast
    torch_dtype: auto
# calib:
#     name: pileval
#     download: False
#     path: /data/yongyang/datasets/llmc/calib/pileval
#     n_samples: 2
#     bs: -1
#     seq_len: 512
#     preproc: txt_general_preproc
#     seed: *seed
# calib:
#     name: custom_txt
#     download: False
#     path: /data/yongyang/datasets/general_custom_data
#     apply_chat_template: True
#     n_samples: 8
#     bs: -1
#     padding: True
#     seed: *seed
# calib:
#     name: custom_txt
#     download: False
#     path: /data/yongyang/datasets/general_custom_data
#     apply_chat_template: True
#     n_samples: 8
#     seq_len: 3
#     bs: -1
#     preproc: random_truncate_txt
#     seed: *seed
calib:
    name: custom_mm
    download: False
    path: /data/yongyang/datasets/general_custom_data
    apply_chat_template: True
    add_answer: True # Default is False. If set to True, calib data will include answers.
    n_samples: 8
    bs: -1
    seq_len: 512
    padding: True
    seed: *seed
eval:
    - eval_pos: [pretrain, transformed, fake_quant]
      name: custom_gen
      type: generate_only
      max_new_tokens: 32
      bs: 1
      download: False
      path: /data/yongyang/datasets/general_custom_data
      apply_chat_template: True
      inference_per_block: False
    - eval_pos: [pretrain, transformed, fake_quant]
      name: wikitext2
      type: token_acc
      download: False
      path: /data/yongyang/datasets/llmc/eval/wikitext2
      seq_len: 2048
      # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
      # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
      bs: 1
      inference_per_block: False
    - eval_pos: [pretrain, transformed, fake_quant]
      name: wikitext2
      download: False
      path: /data/yongyang/datasets/llmc/eval/wikitext2
      seq_len: 2048
      # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
      # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
      bs: 1
      inference_per_block: False
quant:
    method: Awq
    weight:
        bit: 4
        symmetric: False
        granularity: per_group
        group_size: 128
    special:
        trans: True
        # The options for "trans_version" include "v1" and "v2",
        # but their results don't differ significantly.
        trans_version: v2
        weight_clip: False
        # For 2-bit quantization, setting "clip_sym: False" will yield better results.
        clip_sym: True
save:
    save_trans: False
    save_fake: False
    save_path: /path/to/save/
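
Taken together, the commented-out calib variants in this debug config enumerate the loaders the refactor introduces: preset text datasets such as pileval (with preproc: txt_general_preproc), plain custom text via custom_txt (with either padding or random_truncate_txt), and multimodal custom_mm with optionally appended answers.
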
@@ -6,13 +6,12 @@ model:
     tokenizer_mode: slow
     torch_dtype: auto
 calib:
-    name: custom
+    name: custom_txt
     download: False
-    load_from_txt: True
-    path: ./inputs.txt
-    n_samples: 128
+    path: calib data path
+    apply_chat_template: True
+    n_samples: 8
     bs: -1
-    preproc: original_txt
     padding: True
     seed: *seed
 eval:
@@ -25,8 +24,6 @@ eval:
     # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
     bs: 1
     inference_per_block: False
-    # Consistency of tokens between original and fake-quantized model output.
-    eval_token_consist: False
 quant:
     method: Awq
     weight:
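
Reconstructed from the hunk above (again, a reading aid rather than a literal file in the commit), the refactored text-calibration block would read approximately:

calib:
    name: custom_txt
    download: False
    path: calib data path
    apply_chat_template: True
    n_samples: 8
    bs: -1
    padding: True
    seed: *seed
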