
Commit fc2557d

🔄 synced file(s) with mlcommons/power-dev (mlcommons#1990)
* 🔄 synced local 'tools/submission/power/power_checker.py' with remote 'compliance/check.py'
1 parent 7fe4741 commit fc2557d

7 files changed, +90 -89 lines changed


README.md

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ Please see the [MLPerf Inference benchmark paper](https://arxiv.org/abs/1911.025
     primaryClass={cs.LG}
 }
 ```
+
 Please see [here](https://docs.mlcommons.org/inference/benchmarks/) for the MLPerf inference documentation website which includes automated commands to run MLPerf inference benchmarks using different implementations.
 
 ## MLPerf Inference v4.1 (submission deadline July 26, 2024)

language/mixtral-8x7b/standalone_infer/hf_eval_all.py

Lines changed: 18 additions & 11 deletions

@@ -42,7 +42,8 @@ def run_infer(df, ckpt_path, bs):
     # Load the model from local if possible.
     model_path = Path(ckpt_path)
     if not model_path.exists():
-        raise RuntimeError(f"{ckpt_path} not existed. Please download the checkpoint from mlcommon")
+        raise RuntimeError(
+            f"{ckpt_path} not existed. Please download the checkpoint from mlcommon")
 
     tokenizer = AutoTokenizer.from_pretrained(
         model_path, padding_side="left", trust_remote_code=True)
@@ -51,7 +52,8 @@ def run_infer(df, ckpt_path, bs):
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.pad_token_id = tokenizer.eos_token_id
 
-    # gen parameter. We stop at 1024. Starting from v5.0, min_token is set to 2 to avoid 0-output issue
+    # gen parameter. We stop at 1024. Starting from v5.0, min_token is set to
+    # 2 to avoid 0-output issue
     gen_kwargs = {
         # "min_new_tokens": 1,
         "min_new_tokens": 2,
@@ -80,9 +82,11 @@ def run_infer(df, ckpt_path, bs):
         eidx = min(sidx + BS, len(df))
 
         # We use batch_encode_plus for batch inference.
-        # Note 9/29/2024: Mixtral changed its tokenizer in Jun. Using the Feb 29 2024 version.
+        # Note 9/29/2024: Mixtral changed its tokenizer in Jun. Using the Feb
+        # 29 2024 version.
         batch_texts = df['input'][sidx:eidx].tolist()
-        batch_ids = tokenizer.batch_encode_plus(batch_texts, return_tensors="pt", padding=True)
+        batch_ids = tokenizer.batch_encode_plus(
+            batch_texts, return_tensors="pt", padding=True)
         # tok_input_length = batch_ids['attention_mask'].sum(
         #     axis=1).to(torch.int32).tolist()
         # input_tokens_lens += tok_input_length
@@ -97,7 +101,7 @@ def run_infer(df, ckpt_path, bs):
         batch_ids = batch_ids.to(device)
         _, length = batch_ids.input_ids.shape
         outputs = model.generate(**batch_ids, num_return_sequences=1,
-            **gen_kwargs)
+                                 **gen_kwargs)
 
         output_ids = outputs[:, length:].cpu().tolist()
         output_tokens += output_ids
@@ -126,6 +130,7 @@ def run_infer(df, ckpt_path, bs):
 
     return output_df
 
+
 def trim_twos(df):
     # Remove all trailing 2s except for 1
     def remove_trailing_twos(lst):
@@ -137,21 +142,25 @@ def remove_trailing_twos(lst):
                 break
         return lst[:-count] if count > 0 else lst
 
-    df['infer_tok_ref_output'] = df['infer_tok_ref_output'].apply(remove_trailing_twos)
+    df['infer_tok_ref_output'] = df['infer_tok_ref_output'].apply(
+        remove_trailing_twos)
     df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
     df['tok_ref_output'] = df['tok_ref_output'].apply(remove_trailing_twos)
     df['tok_ref_output_len'] = df['tok_ref_output'].apply(len)
     return df
 
+
 def mbxp_stop(df):
     stop_tokens = [13, 13940, 28832, 13]
+
     def modify_list(lst):
         for i in range(len(lst) - len(stop_tokens) + 1):
-            if lst[i:i+len(stop_tokens)] == stop_tokens:
-                return lst[:i+len(stop_tokens)]
+            if lst[i:i + len(stop_tokens)] == stop_tokens:
+                return lst[:i + len(stop_tokens)]
         return lst
 
-    df.loc[df['dataset'] == 'MBXP', 'infer_tok_ref_output'] = df[df['dataset'] == 'MBXP']['infer_tok_ref_output'].apply(modify_list)
+    df.loc[df['dataset'] == 'MBXP', 'infer_tok_ref_output'] = df[df['dataset']
+                                                                 == 'MBXP']['infer_tok_ref_output'].apply(modify_list)
     df['trim_lengths'] = df['infer_tok_ref_output'].apply(len)
     return df
 
@@ -190,5 +199,3 @@ def fix_name(df):
 df = fix_name(df)
 
 df.to_pickle(args.output_pkl)
-
-
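
Aside from line wrapping, the functions touched above are small enough to restate on their own. Below is a minimal standalone sketch of the trailing-EOS trimming and MBXP stop-token truncation that trim_twos and mbxp_stop apply; the counting loop, the constant names, and the example lists are illustrative assumptions, since the diff only shows the break/return lines and the stop-token match.

# Illustrative sketch (not the file itself): 2 is treated as the EOS id and
# [13, 13940, 28832, 13] as the MBXP stop sequence, as in the diff above.
EOS_ID = 2
STOP_TOKENS = [13, 13940, 28832, 13]


def remove_trailing_eos(lst):
    # Count trailing EOS tokens and drop them (counting loop inferred).
    count = 0
    for tok in reversed(lst):
        if tok == EOS_ID:
            count += 1
        else:
            break
    return lst[:-count] if count > 0 else lst


def truncate_at_stop(lst):
    # Keep everything up to and including the first stop sequence.
    for i in range(len(lst) - len(STOP_TOKENS) + 1):
        if lst[i:i + len(STOP_TOKENS)] == STOP_TOKENS:
            return lst[:i + len(STOP_TOKENS)]
    return lst


print(remove_trailing_eos([5, 7, 9, 2, 2, 2]))         # [5, 7, 9]
print(truncate_at_stop([5, 13, 13940, 28832, 13, 9]))  # [5, 13, 13940, 28832, 13]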

language/mixtral-8x7b/standalone_infer/run_accuracy.py

Lines changed: 40 additions & 17 deletions

@@ -59,7 +59,8 @@ def calculate_rouge_score(model_outputs, ref_outputs):
     m_result = metric.compute(
         predictions=m_preds, references=m_targets, use_stemmer=True, use_aggregator=False
     )
-    m_rouge_result = {k: round(np.mean(v) * 100, 4) for k, v in m_result.items()}
+    m_rouge_result = {k: round(np.mean(v) * 100, 4)
+                      for k, v in m_result.items()}
 
     return m_rouge_result
 
@@ -101,30 +102,35 @@ def maybe_remove_comma(x: str) -> str:
 def try_float(x: str):
     try:
         ret = float(x)
-    except:
+    except BaseException:
         ret = None
     return ret
 
+
 def postprocess_golang(code: str) -> str:
-    multi_line_imports = re.compile(r"^import \(\n(.+)((?:\n.+)+)\n\)", re.MULTILINE)
+    multi_line_imports = re.compile(
+        r"^import \(\n(.+)((?:\n.+)+)\n\)", re.MULTILINE)
     line_imports = re.compile(r"^import \".*\"")
    func_main = re.compile(r"^func main.*^}", re.MULTILINE | re.DOTALL)
 
-    code = code.replace("package main", "") # Remove package main
+    code = code.replace("package main", "")  # Remove package main
     code = multi_line_imports.sub("", code)
     code = line_imports.sub("", code)
     code = func_main.sub("", code)
 
     return code
 
+
 def postprocess_scala(code: str) -> str:
     code = code.replace("object Main extends App {", "")
     code = "".join(code.splitlines(True)[:-1])
     return code
 
+
 def postprocess_python(code: str) -> str:
     return code.lstrip()
 
+
 def worker(inp_queue, out_queue):
     while True:
         try:
@@ -143,7 +149,7 @@ def worker(inp_queue, out_queue):
         try:
             solution = solution[:solution.index("```")]
         except ValueError:
-            #Happens when a code block isn't closed properly
+            # Happens when a code block isn't closed properly
             pass
 
         if problem["lang"] == "go":
@@ -153,15 +159,22 @@ def worker(inp_queue, out_queue):
         elif problem["lang"] == "scala":
             solution = postprocess_scala(solution)
 
-        # Mixtral likes escaping underscores for some reason, so let's remove these
-        solution = solution.replace("\_", "_")
+        # Mixtral likes escaping underscores for some reason, so let's remove
+        # these
+        solution = solution.replace("\\_", "_")
 
         # The evaluation script evaluates `code = prompt + solution + tests`
-        # But Mixtral regenerates the prompt in its output, so we should remove this
+        # But Mixtral regenerates the prompt in its output, so we should remove
+        # this
         problem["prompt"] = ""
 
         result = checker(problem, solution, timeout=20.0)
-        out_queue.put((key, problem["lang"], result["passed"], result["result"], problem["response"]))
+        out_queue.put(
+            (key,
+             problem["lang"],
+             result["passed"],
+             result["result"],
+             problem["response"]))
 
 
 def convert_pickle(df: pd.DataFrame, result_keys: dict):
@@ -193,7 +206,8 @@ def evaluate_mbxp(n_works: int, df: pd.DataFrame, result_keys: dict):
     n_problems = 0
 
     for lang, problems in by_lang.items():
-        if lang not in ["cpp", "python", "php", "javascript", "ruby", "typescript"]:
+        if lang not in ["cpp", "python", "php",
+                        "javascript", "ruby", "typescript"]:
             raise RuntimeError(f"{lang} not in supported list.")
 
         n_problems += len(problems)
@@ -213,7 +227,10 @@ def evaluate_mbxp(n_works: int, df: pd.DataFrame, result_keys: dict):
     lang_counts = {}
     for i in tqdm(range(n_problems)):
         key, lang, passed, result, response = out_queue.get()
-        passes[key] = {"passed": passed, "result": result, "response": response}
+        passes[key] = {
+            "passed": passed,
+            "result": result,
+            "response": response}
         n_passed += passed
 
         lang_passed.setdefault(lang, 0)
@@ -244,7 +261,8 @@ def evaluate_openorca(df: pd.DataFrame, result_keys: dict):
     score = calculate_rouge_score(gen_output, gt_output)
     gen_token_len = df[result_keys['length']].tolist()
     gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
-    print(f"OpenOrca score: {score}, gen_token_per_sample: {gen_token_per_sample}")
+    print(
+        f"OpenOrca score: {score}, gen_token_per_sample: {gen_token_per_sample}")
     return score
 
 
@@ -266,13 +284,18 @@ def evaluate_gsm8k(df: pd.DataFrame, result_keys: dict):
     em = correct / total
     gen_token_len = df[result_keys['length']].tolist()
     gen_token_per_sample = sum(gen_token_len) / len(gen_token_len)
-    print(f"EM: {em}, correct: {correct} / {total}, gen_token_per_sample: {gen_token_per_sample}")
+    print(
+        f"EM: {em}, correct: {correct} / {total}, gen_token_per_sample: {gen_token_per_sample}")
     return em
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--n_workers", type=int, default=10, help="The number of processes to use")
+    parser.add_argument(
+        "--n_workers",
+        type=int,
+        default=10,
+        help="The number of processes to use")
     parser.add_argument("--results_path", type=str, default="mixtral_8x7b_15000_greedy_reference_fp16_mintoken2.pkl",
                         help="The path to the results file pickle file")
     parser.add_argument("--result_key", type=str, default="ref_output",
@@ -307,9 +330,9 @@ def evaluate_gsm8k(df: pd.DataFrame, result_keys: dict):
     """
 
     df = pd.read_pickle(args.results_path)
-    df_gsm8k = df[df['dataset']=="GSM8K"].copy()
+    df_gsm8k = df[df['dataset'] == "GSM8K"].copy()
     evaluate_gsm8k(df_gsm8k, result_keys)
-    df_openorca = df[df['dataset']=="OpenOrca"].copy()
+    df_openorca = df[df['dataset'] == "OpenOrca"].copy()
     evaluate_openorca(df_openorca, result_keys)
-    df_mbxp = df[df['dataset']=="MBXP"].copy()
+    df_mbxp = df[df['dataset'] == "MBXP"].copy()
     evaluate_mbxp(args.n_workers, df_mbxp, result_keys)
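
Most of the edits in this file are line wrapping, but one is worth calling out: the bare except: in try_float becomes except BaseException:, the usual autopep8 fix for the bare-except warning, with the same catch-everything behaviour. A short sketch of the function as it reads after the change; the usage lines are illustrative, and the note about maybe_remove_comma is an assumption based only on its name in the hunk header.

def try_float(x: str):
    # Return the parsed float, or None if the string is not a number.
    try:
        ret = float(x)
    except BaseException:
        ret = None
    return ret


print(try_float("42.5"))   # 42.5
print(try_float("1,234"))  # None; commas are presumably stripped earlier by maybe_remove_comma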

loadgen/loadgen.cc

Lines changed: 2 additions & 1 deletion

@@ -323,7 +323,8 @@ std::vector<QueryMetadata> GenerateQueries(
     size_t pad_size =
         (loaded_samples.size() - samples_per_query % loaded_samples.size());
     samples_per_query += pad_size;
-  } else if ((scenario != TestScenario::Offline) && (min_queries % loaded_samples.size() != 0)) {
+  } else if ((scenario != TestScenario::Offline) &&
+             (min_queries % loaded_samples.size() != 0)) {
     // In Server, SingleStream, MultiStream mode, the min_queries should be
     // padded
     size_t pad_size =
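
The only change here is wrapping a long else-if condition, but the arithmetic it guards is easy to restate: when min_queries is not already a multiple of the loaded sample set size, it is padded up to the next multiple. A minimal sketch in Python, assuming the min_queries branch uses the same round-up formula the context lines show for samples_per_query; the function and variable names are illustrative, not the C++ ones.

def pad_to_multiple(min_queries: int, loaded_samples_size: int) -> int:
    # Round min_queries up to the next multiple of loaded_samples_size.
    if min_queries % loaded_samples_size != 0:
        pad_size = loaded_samples_size - min_queries % loaded_samples_size
        min_queries += pad_size
    return min_queries


print(pad_to_multiple(270, 64))  # 320 (padded up by 50)
print(pad_to_multiple(256, 64))  # 256 (already a multiple, left unchanged)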

loadgen/test_settings_internal.cc

Lines changed: 2 additions & 2 deletions

@@ -757,8 +757,8 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model,
                &performance_issue_same_index, nullptr);
 
   if (lookupkv(model, scenario, "sample_concatenate_permutation", &val,
-      nullptr))
-      sample_concatenate_permutation = (val == 1) ? true : false;
+               nullptr))
+    sample_concatenate_permutation = (val == 1) ? true : false;
   if (lookupkv(model, "Server", "coalesce_queries", &val, nullptr))
     server_coalesce_queries = (val == 0) ? false : true;
   if (lookupkv(model, "Server", "max_async_queries", &val, nullptr))
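
This hunk is indentation-only, but the surrounding context shows a small asymmetry worth noting: sample_concatenate_permutation is enabled only when the config value is exactly 1, while server_coalesce_queries is disabled only when it is exactly 0. A tiny Python sketch of that distinction; the dictionary stands in for lookupkv and is purely illustrative.

config = {"sample_concatenate_permutation": 1, "coalesce_queries": 2}

# True only for exactly 1, mirroring (val == 1) ? true : false
sample_concatenate_permutation = config["sample_concatenate_permutation"] == 1
# False only for exactly 0, mirroring (val == 0) ? false : true
server_coalesce_queries = config["coalesce_queries"] != 0

print(sample_concatenate_permutation, server_coalesce_queries)  # True True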

0 commit comments
