You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: CHANGELOG.md
+5
Original file line number
Diff line number
Diff line change
@@ -1,3 +1,8 @@
1
+
## 0.4.2
2
+
3
+
* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise, it passes the model a barebones prompt.
4
+
* Adds an `extra_args` parameter to the `.run` method of all MMLU-based evaluators. This way, consumers are able to directly pass any additional arguments they want through to the `lm_eval.evaluators.simple_evaluate` function.
5
+
1
6
## 0.4
2
7
3
8
* Added ability to specify a custom http client to MT-Bench
Copy file name to clipboardExpand all lines: scripts/test_mmlu.py
+59-2
Original file line number
Diff line number
Diff line change
@@ -1,16 +1,73 @@
1
+
# Standard
2
+
from typing import Dict, List, Tuple, TypedDict
3
+
1
4
# First Party
2
5
from instructlab.eval.mmlu import MMLUEvaluator
3
6
7
+
SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."""
8
+
9
+
10
+
class MMLUSample(TypedDict):
11
+
"""
12
+
Example of a single sample returned from lm_eval when running MMLU.
13
+
This is not a comprehensive type, just the subset of fields we care about for this test.
14
+
"""
15
+
16
+
# Arguments is the list of (prompt, answer) pairs passed to MMLU as few-shot samples.
17
+
# They will not be present with few_shot=0
18
+
arguments: List[Tuple[str, str]]
19
+
20
+
21
+
def all_samples_contain_system_prompt(
22
+
samples: Dict[str, List[MMLUSample]], prompt: str
23
+
) -> bool:
24
+
"""
25
+
Given a mapping of evaluation --> list of results, validates that all few-shot examples
26
+
included the system prompt
27
+
"""
28
+
for topic, samples_set in samples.items():
29
+
for sample in samples_set:
30
+
for mmlu_prompt, _ in sample["arguments"]:
31
+
if prompt not in mmlu_prompt:
32
+
# we are looking for the exact system prompt, so there is no need to normalize to lowercase
33
+
print(f"found a sample in the '{topic}' MMLU topic set")
@@ -102,6 +102,8 @@ class AbstractMMLUEvaluator(Evaluator):
102
102
few_shots number of examples
103
103
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
104
104
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
105
+
system_prompt system prompt to be used when applying the chat template
106
+
results full output from the `lm_eval.evaluator.simple_evaluate` function after MMLU has run.
# This method converts general errors from simple_evaluate
184
214
# into a more user-understandable error
@@ -213,12 +243,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
213
243
Evaluator for Massive Multitask Language Understanding (MMLU)
214
244
215
245
Attributes:
216
-
model_path absolute path to or name of a huggingface model
217
-
tasks list of tasks for MMLU to test the model with
218
-
model_dtype dtype of model when served
219
-
few_shots number of examples
220
-
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
221
-
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
246
+
model_path absolute path to or name of a huggingface model
247
+
tasks list of tasks for MMLU to test the model with
248
+
model_dtype dtype of model when served
249
+
few_shots number of examples
250
+
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
251
+
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
252
+
system_prompt system prompt to be used when applying the chat template
0 commit comments