Skip to content

Commit 7dcbe39

Browse files
ikawrakow and Kawrakow authored
Add ability to evaluate multiple choice tasks (ggml-org#5047)
* TruthfulQA: 1st attempt, does not look like it is working. The same implementation can be used for HellaSwag as well, so I converted a HellaSwag validation dataset to the binary format used here and tested with that. The score is only around 50, so something is not quite right. * TruthfulQA: works but the result is bad. I know it works because if I convert the HellaSwag validation data to the binary format used in the truthful_qa_score() function I get the exact same result as from the hellaswag_score() function. But I guess the questions are tricky, and the way I have done the combination of question + answer is very likely not the best. The TruthfulQA validation dataset contains 817 questions, with a random-chance result of around 19%. With this version I get 29.1% for Mistral-7B and 55.2% for Mistral-7B-Instruct-v0.2. The HF leaderboard results for these two models are 42.2% and 68.3%, respectively. * TruthfulQA: fix random sample * TruthfulQA: prepare tasks in parallel for large test datasets * Rename truthful_qa to multiple_choice * Make MSVC happy. I had forgotten that MSVC does not make constexprs available inside a lambda. --------- Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 726c0fa commit 7dcbe39

File tree

3 files changed

+422
-3
lines changed

3 files changed

+422
-3
lines changed

common/common.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
203203
params.prompt_cache_all = true;
204204
} else if (arg == "--prompt-cache-ro") {
205205
params.prompt_cache_ro = true;
206+
} else if (arg == "-bf" || arg == "--binary-file") {
207+
if (++i >= argc) {
208+
invalid_param = true;
209+
break;
210+
}
211+
std::ifstream file(argv[i], std::ios::binary);
212+
if (!file) {
213+
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
214+
invalid_param = true;
215+
break;
216+
}
217+
// store the external file name in params
218+
params.prompt_file = argv[i];
219+
file.seekg(0, std::ios::end);
220+
size_t size = file.tellg();
221+
file.seekg(0, std::ios::beg);
222+
params.prompt.resize(size);
223+
file.read((char *)params.prompt.data(), size);
224+
fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
206225
} else if (arg == "-f" || arg == "--file") {
207226
if (++i >= argc) {
208227
invalid_param = true;
@@ -689,6 +708,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
689708
break;
690709
}
691710
params.winogrande_tasks = std::stoi(argv[i]);
711+
} else if (arg == "--multiple-choice") {
712+
params.multiple_choice = true;
713+
} else if (arg == "--multiple-choice-tasks") {
714+
if (++i >= argc) {
715+
invalid_param = true;
716+
break;
717+
}
718+
params.multiple_choice_tasks = std::stoi(argv[i]);
692719
} else if (arg == "--ignore-eos") {
693720
params.ignore_eos = true;
694721
} else if (arg == "--no-penalize-nl") {
@@ -888,6 +915,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
888915
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
889916
printf(" -f FNAME, --file FNAME\n");
890917
printf(" prompt file to start generation.\n");
918+
printf(" -bf FNAME, --binary-file FNAME\n");
919+
printf(" binary file containing multiple choice tasks.\n");
891920
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
892921
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
893922
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -936,6 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
936965
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
937966
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
938967
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
968+
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
969+
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
939970
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
940971
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
941972
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);

common/common.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ struct gpt_params {
108108
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
109109
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
110110

111+
bool multiple_choice = false; // compute multiple choice score over random tasks from datafile supplied in prompt
112+
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the multiple choice score. If 0, all tasks will be computed
113+
111114
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
112115
bool random_prompt = false; // do not randomize prompt if none provided
113116
bool use_color = false; // use color to distinguish generations and inputs

0 commit comments

Comments
 (0)