update data format in llmc #259

Merged
merged 1 commit on Dec 13, 2024
1 change: 0 additions & 1 deletion configs/quantization/backend/autoawq/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_autoawq: True
save_path: /path/to/save_for_autoawq_rtn_w4/
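Note: this PR removes the eval_token_consist flag from every backend and method config. Token-consistency evaluation appears to move into the new list-style eval section as a dedicated entry with type: token_acc (see awq_w_only_custom_data.yml further down). A minimal sketch of the replacement, with a placeholder dataset path:

eval:
  - eval_pos: [pretrain, transformed, fake_quant]
    name: wikitext2
    type: token_acc  # replaces the removed eval_token_consist flag
    download: False
    path: /path/to/wikitext2
    seq_len: 2048
    bs: 1
    inference_per_block: False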
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/gptq_w4a16.yml
@@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/mlcllm/rtn_w4a16.yml
@@ -22,7 +22,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_mlcllm: True
save_path: /path/to/save_for_mlcllm_rtn_w4/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/sglang/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_sgl: True
save_path: /path/to/save_for_sgl_rtn_w8a8/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/gptq_w4a16.yml
@@ -21,7 +21,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
quant:
method: GPTQ
weight:
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w4a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a16.yml
@@ -20,7 +20,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn_w8a16/
1 change: 0 additions & 1 deletion configs/quantization/backend/vllm/rtn_w8a8.yml
@@ -23,7 +23,6 @@ eval:
bs: 1
seq_len: 2048
inference_per_block: False
eval_token_consist: True
save:
save_vllm: True
save_path: /path/to/save_for_vllm_rtn_w8a8/
2 changes: 0 additions & 2 deletions configs/quantization/methods/AdaDim/adadim_w_a.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: AdaDim
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_chat_data.yml
@@ -25,8 +25,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_a_mix_bits.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
2 changes: 0 additions & 2 deletions configs/quantization/methods/Awq/awq_w_only.yml
@@ -24,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
quant:
method: Awq
weight:
@@ -6,15 +6,14 @@ model:
tokenizer_mode: slow
torch_dtype: auto
calib:
name: alm_datastes
type: audio_txt
name: custom_mm
download: False
path: calib data path
add_answer: False
n_samples: 128
apply_chat_template: True
add_answer: True # Default is False. If set to True, calibration data will include answers.
n_samples: 8
bs: -1
seq_len: 512
preproc: alm_general
padding: True
seed: *seed
eval:
@@ -27,8 +26,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
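For readability, here is the calib section this audio config likely ends up with after the change (a reconstruction from the interleaved diff above; "calib data path" is the placeholder used in the diff itself):

calib:
  name: custom_mm
  download: False
  path: calib data path
  apply_chat_template: True
  add_answer: True  # default is False; when True, calibration samples include answers
  n_samples: 8
  bs: -1
  seq_len: 512
  padding: True
  seed: *seed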
@@ -7,15 +7,14 @@ model:
torch_dtype: auto
use_cpu_to_save_cuda_mem_for_catcher: False
calib:
name: avlm_datastes
type: audio_img_txt
name: custom_mm
download: False
path: calib data path
add_answer: False
n_samples: 128
apply_chat_template: True
add_answer: True # Default is False. If set to True, calibration data will include answers.
n_samples: 8
bs: -1
seq_len: 512
preproc: avlm_general
padding: True
seed: *seed
eval:
@@ -28,8 +27,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
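The audio-visual config above follows the same migration as the audio config: avlm_datastes with its avlm_general preproc gives way to the unified custom_mm loader, so its resulting calib section matches the sketch above.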
47 changes: 32 additions & 15 deletions configs/quantization/methods/Awq/awq_w_only_custom_data.yml
@@ -6,27 +6,44 @@ model:
tokenizer_mode: slow
torch_dtype: auto
calib:
name: custom
name: custom_txt
download: False
load_from_txt: True
path: ./inputs.txt
path: calib data path
apply_chat_template: True
n_samples: 128
bs: -1
seq_len: 512
bs: -1
preproc: random_truncate_txt
seed: *seed
eval:
eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
download: False
path: eval data path
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: True
- eval_pos: [pretrain, transformed, fake_quant]
name: custom_gen
type: generate_only
max_new_tokens: 32
bs: 1
download: False
path: /data/yongyang/datasets/general_custom_data
apply_chat_template: True
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
type: token_acc
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
quant:
method: Awq
weight:
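Note: the single wikitext2 eval block in this config is replaced by a list of eval entries, each with its own eval_pos. Reading the names against the comments removed elsewhere in this PR: type: generate_only runs free generation on the custom prompts (up to max_new_tokens tokens per sample), type: token_acc measures the consistency of tokens between the original and fake-quantized model outputs (the job of the removed eval_token_consist flag), and the entry with no type presumably keeps the default perplexity evaluation on wikitext2.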
110 changes: 110 additions & 0 deletions configs/quantization/methods/Awq/awq_w_only_custom_data_debug.yml
@@ -0,0 +1,110 @@
base:
seed: &seed 42
model:
# type: Qwen2
# path: /data/yongyang/models/qwen25/Qwen2.5-0.5B-Instruct
type: Qwen2VL
path: /data/yongyang/models/Qwen2-VL-2B-Instruct
# type: Llava
# path: /data/yongyang/models/llava-1.5-7b-hf
# type: InternVL2
# path: /data/yongyang/models/InternVL2-2B
# type: Qwen2Audio
# path: /data/yongyang/models/Qwen2-Audio-7B-Instruct
# type: InternOmni
# path: /data/yongyang/models/InternOmni
# type: Llama
# path: /data/yongyang/models/Meta-Llama-3.1-8B-Instruct
# type: InternLM2
# path: /data/yongyang/models/internlm2-chat-1_8b
# type: DeepseekV2
# path: /data/yongyang/models/DeepSeek-V2-Lite-Chat
tokenizer_mode: fast
torch_dtype: auto
# calib:
# name: pileval
# download: False
# path: /data/yongyang/datasets/llmc/calib/pileval
# n_samples: 2
# bs: -1
# seq_len: 512
# preproc: txt_general_preproc
# seed: *seed
# calib:
# name: custom_txt
# download: False
# path: /data/yongyang/datasets/general_custom_data
# apply_chat_template: True
# n_samples: 8
# bs: -1
# padding: True
# seed: *seed
# calib:
# name: custom_txt
# download: False
# path: /data/yongyang/datasets/general_custom_data
# apply_chat_template: True
# n_samples: 8
# seq_len: 3
# bs: -1
# preproc: random_truncate_txt
# seed: *seed
calib:
name: custom_mm
download: False
path: /data/yongyang/datasets/general_custom_data
apply_chat_template: True
add_answer: True # Default is False. If set to True, calibration data will include answers.
n_samples: 8
bs: -1
seq_len: 512
padding: True
seed: *seed
eval:
- eval_pos: [pretrain, transformed, fake_quant]
name: custom_gen
type: generate_only
max_new_tokens: 32
bs: 1
download: False
path: /data/yongyang/datasets/general_custom_data
apply_chat_template: True
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
type: token_acc
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
- eval_pos: [pretrain, transformed, fake_quant]
name: wikitext2
download: False
path: /data/yongyang/datasets/llmc/eval/wikitext2
seq_len: 2048
# For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
quant:
method: Awq
weight:
bit: 4
symmetric: False
granularity: per_group
group_size: 128
special:
trans: True
# The options for "trans_version" include "v1" and "v2".
# But their results don't differ significantly.
trans_version: v2
weight_clip: False
# For 2-bit quantization, setting "clip_sym: False" will yield better results.
clip_sym: True
save:
save_trans: False
save_fake: False
save_path: /path/to/save/
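This new debug config keeps alternative setups side by side as comments: one model type per supported family (Qwen2, Qwen2VL, Llava, InternVL2, Qwen2Audio, InternOmni, Llama, InternLM2, DeepseekV2) and three calib variants (pileval with txt_general_preproc, custom_txt with padding, and custom_txt with random_truncate_txt), presumably so each path through the new data format can be exercised by uncommenting one block at a time.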
@@ -6,13 +6,12 @@ model:
tokenizer_mode: slow
torch_dtype: auto
calib:
name: custom
name: custom_txt
download: False
load_from_txt: True
path: ./inputs.txt
n_samples: 128
path: calib data path
apply_chat_template: True
n_samples: 8
bs: -1
preproc: original_txt
padding: True
seed: *seed
eval:
@@ -25,8 +24,6 @@ eval:
# For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
bs: 1
inference_per_block: False
# Consistency of tokens between original and fake-quantized model output.
eval_token_consist: False
quant:
method: Awq
weight:
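As above, a reconstruction of the calib section this config likely ends up with (note this custom_txt variant relies on padding: True rather than a preproc):

calib:
  name: custom_txt
  download: False
  path: calib data path
  apply_chat_template: True
  n_samples: 8
  bs: -1
  padding: True
  seed: *seed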