Add ability to evaluate multiple choice tasks (ggml-org#5047)

ikawrakow · Kawrakow · web-flow · commit 7dcbe39d36b7 · 2024-01-21T14:42:44.000+02:00
* TruthfulQA: 1st attempt, does not look like it is working

The same implementation can be used for HellaSwag as well,
so I converted a HellaSwag validation dataset to the binary
format used here and tested with that. The score is only
around 50, so something is not quite right.

* TruthfulQA: works but the result is bad

I know it works because if I convert the HellaSwag validation
data to the binary format used in the truthful_qa_score() function
I get the exact same result as from the hellaswag_score() function.
But I guess, the questions are tricky and the way I have done
the combination of question + answer is very likely not the best.
The TruthfulQA validation dataset contains 817 questions, with
random chance result around 19%. With this version I get
29.1% for Mistral-7B and 55.2% for Mistral-7B-Instruct-v0.2.
The HF leader board results for these two models are
42.2% and 68.3%, respectively.

* TruthfulQA: fix random sample

* TruthfulQA: prepare tasks in parallel for large test datasets

* Rename truthful_qa to multiple_choice

* Make MSVC happy

I had forgotten that MSVC does not make constexpr's available
inside a lambda.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
diff --git a/common/common.cpp b/common/common.cpp
@@ -203,6 +203,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.prompt_cache_all = true;
         } else if (arg == "--prompt-cache-ro") {
             params.prompt_cache_ro = true;
+        } else if (arg == "-bf" || arg == "--binary-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i], std::ios::binary);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            // store the external file name in params
+            params.prompt_file = argv[i];
+            file.seekg(0, std::ios::end);
+            size_t size = file.tellg();
+            file.seekg(0, std::ios::beg);
+            params.prompt.resize(size);
+            file.read((char *)params.prompt.data(), size);
+            fprintf(stderr, "Read %zu bytes from binary file %s\n", size, argv[i]);
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -689,6 +708,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.winogrande_tasks = std::stoi(argv[i]);
+        } else if (arg == "--multiple-choice") {
+            params.multiple_choice = true;
+        } else if (arg == "--multiple-choice-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.multiple_choice_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
@@ -888,6 +915,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
     printf("  -f FNAME, --file FNAME\n");
     printf("                        prompt file to start generation.\n");
+    printf("  -bf FNAME, --binary-file FNAME\n");
+    printf("                        binary file containing multiple choice tasks.\n");
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -936,6 +965,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
     printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
     printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
+    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
+    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
     printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
     printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
     printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
diff --git a/common/common.h b/common/common.h
@@ -108,6 +108,9 @@ struct gpt_params {
     bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
     size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
 
+    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+
     bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
     bool random_prompt     = false; // do not randomize prompt if none provided
     bool use_color         = false; // use color to distinguish generations and inputs
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp