From 283b39c760705b2cbaaf9f1ecd4f3820b9b13440 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 13 Aug 2024 21:13:25 +0530 Subject: [PATCH 01/27] updated vllm server run cmd --- main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index da6a1d3e8..4705ead83 100644 --- a/main.py +++ b/main.py @@ -209,7 +209,9 @@ def get_inference_server_run_cmd(spaces, implementation): return f"""\n {pre_space}```bash {pre_space}cm run script --tags=run,vllm-server \\ -{indent}--model=nm-testing/Llama-2-70b-chat-hf-FP8 +{indent}--model=nm-testing/Llama-2-70b-chat-hf-FP8 \\ +{indent}--vllm_model_name=nm-testing/Llama-2-70b-chat-hf-FP8 \\ +{indent}--quiet {pre_space}```\n""" def get_venv_command(spaces): From 4f5cbcdf549253cae5d202d1485ab86cedc03c6e Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 14 Aug 2024 13:04:31 +0530 Subject: [PATCH 02/27] benchmark page information addition --- docs/benchmarks/index.md | 124 ----------------------------- docs/index.md | 162 +++++++++++++++++++++++++++++++++++++- docs/install/index.md | 2 +- docs/official_gh_index.md | 1 + docs/submission/index.md | 2 +- mkdocs.yml | 2 +- 6 files changed, 165 insertions(+), 128 deletions(-) delete mode 100644 docs/benchmarks/index.md mode change 120000 => 100644 docs/index.md create mode 120000 docs/official_gh_index.md diff --git a/docs/benchmarks/index.md b/docs/benchmarks/index.md deleted file mode 100644 index af8090615..000000000 --- a/docs/benchmarks/index.md +++ /dev/null @@ -1,124 +0,0 @@ -# MLPerf Inference Benchmarks - -## Overview -This document provides details on various MLPerf Inference Benchmarks categorized by tasks, models, and datasets. Each section lists the models performing similar tasks, with details on datasets, accuracy, and server latency constraints. - ---- - -## 1. Image Classification -### [ResNet50-v1.5](image_classification/resnet50.md) -- **Dataset**: Imagenet-2012 (224x224) Validation - - **Size**: 50,000 - - **QSL Size**: 1,024 -- **Reference Model Accuracy**: 76.46% -- **Server Scenario Latency Constraint**: 15ms - ---- - -## 2. Text to Image -### [Stable Diffusion](text_to_image/sdxl.md) -- **Dataset**: Subset of Coco2014 - - **Size**: 5,000 - - **QSL Size**: 5,000 -- **Required Accuracy (Closed Division)**: - - FID: 23.01085758 ≤ FID ≤ 23.95007626 - - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 - ---- - -## 3. Object Detection -### [Retinanet](object_detection/retinanet.md) -- **Dataset**: OpenImages - - **Size**: 24,781 - - **QSL Size**: 64 -- **Reference Model Accuracy**: 0.3755 mAP -- **Server Scenario Latency Constraint**: 100ms - ---- - -## 4. Medical Image Segmentation -### [3d-unet](medical_imaging/3d-unet.md) -- **Dataset**: KiTS2019 - - **Size**: 42 - - **QSL Size**: 42 -- **Reference Model Accuracy**: 0.86330 Mean DICE Score -- **Server Scenario**: Not Applicable - ---- - -## 5. Language Tasks - -### 5.1. Question Answering - -### [Bert-Large](language/bert.md) -- **Dataset**: Squad v1.1 (384 Sequence Length) - - **Size**: 10,833 - - **QSL Size**: 10,833 -- **Reference Model Accuracy**: F1 Score = 90.874% -- **Server Scenario Latency Constraint**: 130ms - -### [LLAMA2-70B](language/llama2-70b.md) -- **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) - - **Size**: 24,576 - - **QSL Size**: 24,576 -- **Reference Model Accuracy**: - - Rouge1: 44.4312 - - Rouge2: 22.0352 - - RougeL: 28.6162 - - Tokens_per_sample: 294.45 -- **Server Scenario Latency Constraint**: - - TTFT: 2000ms - - TPOT: 200ms - -### 5.2. 
Text Summarization - -### [GPT-J](language/gpt-j.md) -- **Dataset**: CNN Daily Mail v3.0.0 - - **Size**: 13,368 - - **QSL Size**: 13,368 -- **Reference Model Accuracy**: - - Rouge1: 42.9865 - - Rouge2: 20.1235 - - RougeL: 29.9881 - - Gen_len: 4,016,878 -- **Server Scenario Latency Constraint**: 20s - -### 5.3. Mixed Tasks (Question Answering, Math, and Code Generation) - -### [Mixtral-8x7B](language/mixtral-8x7b.md) -- **Datasets**: - - OpenORCA (5k samples of GPT-4 split, max_seq_len=2048) - - GSM8K (5k samples of the validation split, max_seq_len=2048) - - MBXP (5k samples of the validation split, max_seq_len=2048) - - **Size**: 15,000 - - **QSL Size**: 15,000 -- **Reference Model Accuracy**: - - Rouge1: 45.4911 - - Rouge2: 23.2829 - - RougeL: 30.3615 - - GSM8K Accuracy: 73.78% - - MBXP Accuracy: 60.12% - - Tokens_per_sample: 294.45 -- **Server Scenario Latency Constraint**: - - TTFT: 2000ms - - TPOT: 200ms - ---- - -## 6. Recommendation -### [DLRMv2](recommendation/dlrm-v2.md) -- **Dataset**: Synthetic Multihot Criteo - - **Size**: 204,800 - - **QSL Size**: 204,800 -- **Reference Model Accuracy**: AUC = 80.31% -- **Server Scenario Latency Constraint**: 60ms - ---- - -### Participation Categories -- **Datacenter Category**: All nine benchmarks can participate. -- **Edge Category**: All benchmarks except DLRMv2, LLAMA2, and Mixtral-8x7B can participate. - -### High Accuracy Variants -- **Benchmarks**: `bert`, `llama2-70b`, `dlrm_v2`, and `3d-unet` -- **Requirement**: Must achieve at least 99.9% of the FP32 reference model accuracy, compared to the default 99% accuracy requirement. diff --git a/docs/index.md b/docs/index.md deleted file mode 120000 index 32d46ee88..000000000 --- a/docs/index.md +++ /dev/null @@ -1 +0,0 @@ -../README.md \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..c77175e2f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,161 @@ +# MLPerf Inference Benchmarks + +## Overview +This document provides details on various [MLPerf Inference Benchmarks](official_gh_index.md) categorized by tasks, models, and datasets. Each section lists the models performing similar tasks, with details on datasets, accuracy, and server latency constraints. + +--- + +## 1. Image Classification +### [ResNet50-v1.5](benchmarks/image_classification/resnet50.md) +- **Dataset**: Imagenet-2012 (224x224) Validation + - **Dataset Size**: 50,000 + - **QSL Size**: 1,024 +- **Number of Parameters**: 25,636,712 +- **Reference Model Accuracy**: 76.46% ACC +- **Server Scenario Latency Constraint**: 15ms +- **Equal Issue mode**: False +- **Accuracy Variants**: None +- **Precision**: fp32, int8 + +--- + +## 2. Text to Image +### [Stable Diffusion](benchmarks/text_to_image/sdxl.md) +- **Dataset**: Subset of Coco2014 + - **Dataset Size**: 5,000 + - **QSL Size**: 5,000 +- **Number of Parameters**: 890,000,000 +- **Required Accuracy (Closed Division)**: + - FID: 23.01085758 ≤ FID ≤ 23.95007626 + - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 +- **Equal Issue mode**: True +- **Accuracy Variants**: None +- **Precision**: fp32, fp16 + +--- + +## 3. 
Object Detection +### [Retinanet](benchmarks/object_detection/retinanet.md) +- **Dataset**: OpenImages + - **Dataset Size**: 24,781 + - **QSL Size**: 64 +- **Number of Parameters**: 50,000,000 +- **Reference Model Accuracy**: 0.3755 mAP +- **Server Scenario Latency Constraint**: 100ms +- **Equal Issue mode**: False +- **Accuracy Variants**: None +- **Accuracy Variants**: 99% and 99.9% of reference model accuracy +- **Precision**: fp32 + +--- + +## 4. Medical Image Segmentation +### [3d-unet](benchmarks/medical_imaging/3d-unet.md) +- **Dataset**: KiTS2019 + - **Dataset Size**: 42 + - **QSL Size**: 42 +- **Number of Parameters**: 8,000,000 +- **Reference Model Accuracy**: 0.86330 Mean DICE Score +- **Server Scenario**: Not Applicable +- **Equal Issue mode**: True +- **Accuracy Variants**: 99% and 99.9% of reference model accuracy +- **Precision**: fp32 + +--- + +## 5. Language Tasks + +### 5.1. Question Answering + +### [Bert-Large](benchmarks/language/bert.md) +- **Dataset**: Squad v1.1 (384 Sequence Length) + - **Dataset Size**: 10,833 + - **QSL Size**: 10,833 +- **Number of Parameters**: 345,000,000 +- **Reference Model Accuracy**: F1 Score = 90.874% +- **Server Scenario Latency Constraint**: 130ms +- **Equal Issue mode**: True +- **Accuracy Variants**: 99% and 99.9% of reference model accuracy +- **Precision**: fp32, int8 + +### [LLAMA2-70B](benchmarks/language/llama2-70b.md) +- **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) + - **Dataset Size**: 24,576 + - **QSL Size**: 24,576 +- **Number of Parameters**: 70,000,000,000 +- **Reference Model Accuracy**: + - Rouge1: 44.4312 + - Rouge2: 22.0352 + - RougeL: 28.6162 + - Tokens_per_sample: 294.45 +- **Server Scenario Latency Constraint**: + - TTFT: 2000ms + - TPOT: 200ms +- **Equal Issue mode**: True +- **Accuracy Variants**: 99% and 99.9% of reference model accuracy +- **Precision**: fp32 + +### 5.2. Text Summarization + +### [GPT-J](benchmarks/language/gpt-j.md) +- **Dataset**: CNN Daily Mail v3.0.0 + - **Dataset Size**: 13,368 + - **QSL Size**: 13,368 +- **Number of Parameters**: 6,000,000,000 +- **Reference Model Accuracy**: + - Rouge1: 42.9865 + - Rouge2: 20.1235 + - RougeL: 29.9881 + - Gen_len: 4,016,878 +- **Server Scenario Latency Constraint**: 20s +- **Equal Issue mode**: True +- **Accuracy Variants**: 99% and 99.9% of reference model accuracy +- **Precision**: fp32 + +### 5.3. Mixed Tasks (Question Answering, Math, and Code Generation) + +### [Mixtral-8x7B](benchmarks/language/mixtral-8x7b.md) +- **Datasets**: + - OpenORCA (5k samples of GPT-4 split, max_seq_len=2048) + - GSM8K (5k samples of the validation split, max_seq_len=2048) + - MBXP (5k samples of the validation split, max_seq_len=2048) + - **Dataset Size**: 15,000 + - **QSL Size**: 15,000 +- **Number of Parameters**: 56,000,000,000 +- **Reference Model Accuracy**: + - Rouge1: 45.4911 + - Rouge2: 23.2829 + - RougeL: 30.3615 + - GSM8K Accuracy: 73.78% + - MBXP Accuracy: 60.12% + - Tokens_per_sample: 294.45 +- **Server Scenario Latency Constraint**: + - TTFT: 2000ms + - TPOT: 200ms +- **Equal Issue mode**: True +- **Accuracy Variants**: None +- **Precision**: fp16 + +--- + +## 6. 
Recommendation +### [DLRMv2](benchmarks/recommendation/dlrm-v2.md) +- **Dataset**: Synthetic Multihot Criteo + - **Dataset Size**: 204,800 + - **QSL Size**: 204,800 +- **Number of Parameters**: 1,000,000,000 +- **Reference Model Accuracy**: AUC = 80.31% +- **Server Scenario Latency Constraint**: 60ms +- **Equal Issue mode**: False +- **Accuracy Variants**: 99% and 99.9% of reference model accuracy +- **Precision**: fp32 + +--- + +### Participation Categories +- **Datacenter Category**: All nine benchmarks can participate. +- **Edge Category**: All benchmarks except DLRMv2, LLAMA2, and Mixtral-8x7B can participate. + +### High Accuracy Variants +- **Benchmarks**: `bert`, `llama2-70b`, `dlrm_v2`, and `3d-unet` +- **Requirement**: Must achieve at least 99.9% of the reference model accuracy, compared to the default 99% accuracy requirement. diff --git a/docs/install/index.md b/docs/install/index.md index f365aade3..60377adee 100644 --- a/docs/install/index.md +++ b/docs/install/index.md @@ -28,4 +28,4 @@ CM needs `git`, `python3-pip` and `python3-venv` installed on your system. If an Here, repo is in the format `githubUsername@githubRepo`. -Now, you are ready to use the `cm` commands to run MLPerf inference as given in the [benchmarks](../benchmarks/index.md) page +Now, you are ready to use the `cm` commands to run MLPerf inference as given in the [benchmarks](../index.md) page diff --git a/docs/official_gh_index.md b/docs/official_gh_index.md new file mode 120000 index 000000000..32d46ee88 --- /dev/null +++ b/docs/official_gh_index.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file diff --git a/docs/submission/index.md b/docs/submission/index.md index e294efb71..80088d444 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -3,7 +3,7 @@ hide: - toc --- -If you follow the `cm run` commands under the individual model pages in the [benchmarks](../benchmarks/index.md) directory, all the valid results will get aggregated to the `cm cache` folder. Once all the results across all the modelsare ready you can use the following command to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). +If you follow the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. Once all the results across all the modelsare ready you can use the following command to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). 
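Before generating the tree, it can help to sanity-check what has actually been aggregated. Below is a minimal sketch, assuming the CM CLI is installed and at least one benchmark run has completed; the `cm find cache --tags=get,mlperf,inference,results,dir` tag set is the same one used elsewhere in these docs, while the small Python wrapper itself is only illustrative.

```python
# Illustrative sketch: locate the CM-aggregated results folder(s) and list the
# per-model sub-folders before generating the submission tree.
# Assumes the CM CLI is installed and at least one benchmark run has completed.
import subprocess
from pathlib import Path


def find_results_dirs():
    # Same tag set as the `cm find cache ... | xargs tree` command used in these docs.
    out = subprocess.run(
        ["cm", "find", "cache", "--tags=get,mlperf,inference,results,dir"],
        capture_output=True, text=True, check=True,
    ).stdout
    return [Path(p) for p in out.split() if Path(p).is_dir()]


if __name__ == "__main__":
    for results_dir in find_results_dirs():
        print(f"Aggregated results under: {results_dir}")
        for model_dir in sorted(d for d in results_dir.iterdir() if d.is_dir()):
            print(f"  - {model_dir.name}")
```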
## Generate actual submission tree diff --git a/mkdocs.yml b/mkdocs.yml index 90a3ce9fd..8e59acf63 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,7 +22,7 @@ nav: - Install: - install/index.md - Benchmarks: - - benchmarks/index.md + - index.md - Image Classification: - ResNet50: benchmarks/image_classification/resnet50.md - Text to Image: From 8e71518726249bf51a60ea8565eff16f15e780b0 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 14 Aug 2024 13:07:10 +0530 Subject: [PATCH 03/27] fix indendation issue --- docs/submission/index.md | 80 ++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 80088d444..ffd9371e7 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -11,62 +11,62 @@ If you follow the `cm run` commands under the individual model pages in the [ben ### Closed Edge Submission ```bash cm run script -tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --category=edge \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet ``` === "Closed Datacenter" ### Closed Datacenter Submission ```bash cm run script -tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --category=datacenter \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet ``` === "Open Edge" ### Open Edge Submission ```bash cm run script -tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --category=edge \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet ``` === "Open Datacenter" ### Closed Datacenter Submission ```bash cm run script -tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --category=datacenter \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet ``` * Use `--hw_name="My system name"` to give a meaningful system name. 
Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems) From e9dcf17a08939bba1f3d639c2616a532dfd71a16 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 14 Aug 2024 16:08:31 +0530 Subject: [PATCH 04/27] Added submission categories --- docs/index.md | 19 ++++++++++++++----- docs/{official_gh_index.md => index_gh.md} | 0 2 files changed, 14 insertions(+), 5 deletions(-) rename docs/{official_gh_index.md => index_gh.md} (100%) diff --git a/docs/index.md b/docs/index.md index c77175e2f..c49c5a355 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ # MLPerf Inference Benchmarks ## Overview -This document provides details on various [MLPerf Inference Benchmarks](official_gh_index.md) categorized by tasks, models, and datasets. Each section lists the models performing similar tasks, with details on datasets, accuracy, and server latency constraints. +This document provides details on various [MLPerf Inference Benchmarks](index_gh.md) categorized by tasks, models, and datasets. Each section lists the models performing similar tasks, with details on datasets, accuracy, and server latency constraints. --- @@ -16,6 +16,7 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Equal Issue mode**: False - **Accuracy Variants**: None - **Precision**: fp32, int8 +- **Submission Category**: Data Center, Edge --- @@ -30,7 +31,8 @@ This document provides details on various [MLPerf Inference Benchmarks](official - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 - **Equal Issue mode**: True - **Accuracy Variants**: None -- **Precision**: fp32, fp16 +- **Precision**: fp32 +- **Submission Category**: Data Center, Edge --- @@ -46,6 +48,7 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Accuracy Variants**: None - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32 +- **Submission Category**: Data Center, Edge --- @@ -60,6 +63,7 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Equal Issue mode**: True - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32 +- **Submission Category**: Data Center, Edge --- @@ -77,6 +81,7 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Equal Issue mode**: True - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32, int8 +- **Submission Category**: Data Center, Edge ### [LLAMA2-70B](benchmarks/language/llama2-70b.md) - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) @@ -94,6 +99,7 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Equal Issue mode**: True - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32 +- **Submission Category**: Data Center ### 5.2. Text Summarization @@ -111,6 +117,7 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Equal Issue mode**: True - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32 +- **Submission Category**: Data Center, Edge ### 5.3. 
Mixed Tasks (Question Answering, Math, and Code Generation) @@ -133,8 +140,9 @@ This document provides details on various [MLPerf Inference Benchmarks](official - TTFT: 2000ms - TPOT: 200ms - **Equal Issue mode**: True -- **Accuracy Variants**: None +- **Accuracy Variants**: 99% and of reference model - **Precision**: fp16 +- **Submission Category**: Data Center --- @@ -149,11 +157,12 @@ This document provides details on various [MLPerf Inference Benchmarks](official - **Equal Issue mode**: False - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32 +- **Submission Category**: Data Center --- -### Participation Categories -- **Datacenter Category**: All nine benchmarks can participate. +### Submission Categories +- **Datacenter Category**: All the benchmarks can participate. - **Edge Category**: All benchmarks except DLRMv2, LLAMA2, and Mixtral-8x7B can participate. ### High Accuracy Variants diff --git a/docs/official_gh_index.md b/docs/index_gh.md similarity index 100% rename from docs/official_gh_index.md rename to docs/index_gh.md From 4f56494dd8bbbc9395900667f7066ce95fd91b82 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 14 Aug 2024 19:21:10 +0530 Subject: [PATCH 05/27] update submission page - generate submission with or w/o using CM for benchmarking --- docs/submission/index.md | 47 +++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index ffd9371e7..6f6d82a39 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -3,14 +3,51 @@ hide: - toc --- -If you follow the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. Once all the results across all the modelsare ready you can use the following command to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). +=== "CM based benchmark" + If you have followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. The following command could be used to browse the structure of inference results folder generated by CM. + ### Get results folder structure + ```bash + cm find cache --tags=get,mlperf,inference,results,dir | xargs tree + ``` +=== "Non CM based benchmark" + If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, make sure that the result directory is structured in the following way. + ``` + └── SUT_Name + ├── cm_sut_info.json + ├── model_mapping.json + └── model + └── Scenario + └── loadgen_Mode + ├── file-1 + ├── ... + └── file-n + ``` + + !!! tip + + - The `cm_sut_info.json` should contain the following keys + - `system_name` + - `implementation` + - `device` + - `framework` + - `run_config` + - The `model_mapping.json` is used to map the custom model full name to the official model name. 
The format of json file is: + ``` + { + "custom_model_name_for_model1":"official_model_name_for_model1", + "custom_model_name_for_model2":"official_model_name_for_model2", + + } + ``` + +Once all the results across all the models are ready you can use the following command to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). ## Generate actual submission tree === "Closed Edge" ### Closed Edge Submission ```bash - cm run script -tags=generate,inference,submission \ + cm run script --tags=generate,inference,submission \ --clean \ --preprocess_submission=yes \ --run-checker \ @@ -26,7 +63,7 @@ If you follow the `cm run` commands under the individual model pages in the [ben === "Closed Datacenter" ### Closed Datacenter Submission ```bash - cm run script -tags=generate,inference,submission \ + cm run script --tags=generate,inference,submission \ --clean \ --preprocess_submission=yes \ --run-checker \ @@ -41,7 +78,7 @@ If you follow the `cm run` commands under the individual model pages in the [ben === "Open Edge" ### Open Edge Submission ```bash - cm run script -tags=generate,inference,submission \ + cm run script --tags=generate,inference,submission \ --clean \ --preprocess_submission=yes \ --run-checker \ @@ -56,7 +93,7 @@ If you follow the `cm run` commands under the individual model pages in the [ben === "Open Datacenter" ### Closed Datacenter Submission ```bash - cm run script -tags=generate,inference,submission \ + cm run script --tags=generate,inference,submission \ --clean \ --preprocess_submission=yes \ --run-checker \ From f1135ead5ec149cc2c7badaf07ebc1381143e18a Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 16 Aug 2024 14:52:45 +0530 Subject: [PATCH 06/27] Updated kits dataset documentation --- docs/benchmarks/medical_imaging/get-3d-unet-data.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/benchmarks/medical_imaging/get-3d-unet-data.md b/docs/benchmarks/medical_imaging/get-3d-unet-data.md index e5bbfbab6..6c361f6f1 100644 --- a/docs/benchmarks/medical_imaging/get-3d-unet-data.md +++ b/docs/benchmarks/medical_imaging/get-3d-unet-data.md @@ -12,9 +12,14 @@ The benchmark implementation run command will automatically download the validat === "Validation" 3d-unet validation run uses the KiTS19 dataset performing [KiTS 2019](https://kits19.grand-challenge.org/) kidney tumor segmentation task - ### Get Validation Dataset + ### Get Validation Dataset(Original) ``` - cm run script --tags=get,dataset,kits19,validation -j + cm run script --tags=get,dataset,kits19,_validation -j + ``` + + ### Get Validation Dataset(Preprocessed) + ``` + cm run script --tags=get,dataset,kits19,preprocessed -j ``` ## Model From cb71cd167d50f6c53fa0603452c7bf2b72432a5c Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 16 Aug 2024 21:28:36 +0530 Subject: [PATCH 07/27] Updated model parameters --- docs/index.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/index.md b/docs/index.md index c49c5a355..7b787441e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,12 +10,12 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Imagenet-2012 (224x224) Validation - **Dataset Size**: 50,000 - **QSL Size**: 1,024 -- **Number of Parameters**: 25,636,712 +- **Number of Parameters**: 25.6 Million - **Reference Model Accuracy**: 76.46% ACC - **Server Scenario Latency 
Constraint**: 15ms - **Equal Issue mode**: False - **Accuracy Variants**: None -- **Precision**: fp32, int8 +- **Precision**: fp32 - **Submission Category**: Data Center, Edge --- @@ -25,11 +25,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Subset of Coco2014 - **Dataset Size**: 5,000 - **QSL Size**: 5,000 -- **Number of Parameters**: 890,000,000 +- **Number of Parameters**: 3.5 Billion - **Required Accuracy (Closed Division)**: - FID: 23.01085758 ≤ FID ≤ 23.95007626 - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 -- **Equal Issue mode**: True +- **Equal Issue mode**: False - **Accuracy Variants**: None - **Precision**: fp32 - **Submission Category**: Data Center, Edge @@ -41,7 +41,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: OpenImages - **Dataset Size**: 24,781 - **QSL Size**: 64 -- **Number of Parameters**: 50,000,000 +- **Number of Parameters**: TBD - **Reference Model Accuracy**: 0.3755 mAP - **Server Scenario Latency Constraint**: 100ms - **Equal Issue mode**: False @@ -57,7 +57,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: KiTS2019 - **Dataset Size**: 42 - **QSL Size**: 42 -- **Number of Parameters**: 8,000,000 +- **Number of Parameters**: 19 Million - **Reference Model Accuracy**: 0.86330 Mean DICE Score - **Server Scenario**: Not Applicable - **Equal Issue mode**: True @@ -75,10 +75,10 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Squad v1.1 (384 Sequence Length) - **Dataset Size**: 10,833 - **QSL Size**: 10,833 -- **Number of Parameters**: 345,000,000 +- **Number of Parameters**: 340 Million - **Reference Model Accuracy**: F1 Score = 90.874% - **Server Scenario Latency Constraint**: 130ms -- **Equal Issue mode**: True +- **Equal Issue mode**: False - **Accuracy Variants**: 99% and 99.9% of reference model accuracy - **Precision**: fp32, int8 - **Submission Category**: Data Center, Edge @@ -87,7 +87,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) - **Dataset Size**: 24,576 - **QSL Size**: 24,576 -- **Number of Parameters**: 70,000,000,000 +- **Number of Parameters**: 70 Billion - **Reference Model Accuracy**: - Rouge1: 44.4312 - Rouge2: 22.0352 @@ -107,7 +107,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: CNN Daily Mail v3.0.0 - **Dataset Size**: 13,368 - **QSL Size**: 13,368 -- **Number of Parameters**: 6,000,000,000 +- **Number of Parameters**: 6 Billion - **Reference Model Accuracy**: - Rouge1: 42.9865 - Rouge2: 20.1235 @@ -128,7 +128,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - MBXP (5k samples of the validation split, max_seq_len=2048) - **Dataset Size**: 15,000 - **QSL Size**: 15,000 -- **Number of Parameters**: 56,000,000,000 +- **Number of Parameters**: 47 Billion - **Reference Model Accuracy**: - Rouge1: 45.4911 - Rouge2: 23.2829 @@ -151,7 +151,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Synthetic Multihot Criteo - **Dataset Size**: 204,800 - **QSL Size**: 204,800 -- **Number of Parameters**: 1,000,000,000 +- **Number of Parameters**: TBD - **Reference Model Accuracy**: AUC = 80.31% - **Server Scenario Latency Constraint**: 60ms - **Equal Issue mode**: False From 1579967ff341e3d69920150380abf4ba8070a144 Mon Sep 17 
00:00:00 2001 From: anandhu-eng Date: Mon, 19 Aug 2024 15:19:29 +0530 Subject: [PATCH 08/27] updation of information --- docs/index.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/docs/index.md b/docs/index.md index 7b787441e..907c38ab2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,11 +11,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 50,000 - **QSL Size**: 1,024 - **Number of Parameters**: 25.6 Million +- **FLOPs**: TBD - **Reference Model Accuracy**: 76.46% ACC - **Server Scenario Latency Constraint**: 15ms - **Equal Issue mode**: False -- **Accuracy Variants**: None -- **Precision**: fp32 +- **Accuracy Variants**: 99% of fp32 reference model accuracy - **Submission Category**: Data Center, Edge --- @@ -25,13 +25,13 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Subset of Coco2014 - **Dataset Size**: 5,000 - **QSL Size**: 5,000 -- **Number of Parameters**: 3.5 Billion +- **Number of Parameters**: 3.5 Billion +- **FLOPs**: TBD - **Required Accuracy (Closed Division)**: - FID: 23.01085758 ≤ FID ≤ 23.95007626 - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 - **Equal Issue mode**: False -- **Accuracy Variants**: None -- **Precision**: fp32 +- **Accuracy Variants**: 98% of fp32 reference model accuracy - **Submission Category**: Data Center, Edge --- @@ -42,12 +42,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 24,781 - **QSL Size**: 64 - **Number of Parameters**: TBD +- **FLOPs**: TBD - **Reference Model Accuracy**: 0.3755 mAP - **Server Scenario Latency Constraint**: 100ms - **Equal Issue mode**: False -- **Accuracy Variants**: None -- **Accuracy Variants**: 99% and 99.9% of reference model accuracy -- **Precision**: fp32 +- **Accuracy Variants**: 99% of fp32 reference model accuracy - **Submission Category**: Data Center, Edge --- @@ -58,11 +57,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 42 - **QSL Size**: 42 - **Number of Parameters**: 19 Million +- **FLOPs**: TBD - **Reference Model Accuracy**: 0.86330 Mean DICE Score - **Server Scenario**: Not Applicable - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and 99.9% of reference model accuracy -- **Precision**: fp32 +- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy - **Submission Category**: Data Center, Edge --- @@ -76,11 +75,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 10,833 - **QSL Size**: 10,833 - **Number of Parameters**: 340 Million +- **FLOPs**: TBD - **Reference Model Accuracy**: F1 Score = 90.874% - **Server Scenario Latency Constraint**: 130ms - **Equal Issue mode**: False -- **Accuracy Variants**: 99% and 99.9% of reference model accuracy -- **Precision**: fp32, int8 +- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy - **Submission Category**: Data Center, Edge ### [LLAMA2-70B](benchmarks/language/llama2-70b.md) @@ -88,6 +87,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 24,576 - **QSL Size**: 24,576 - **Number of Parameters**: 70 Billion +- **FLOPs**: TBD - **Reference Model Accuracy**: - Rouge1: 44.4312 - Rouge2: 22.0352 @@ -97,8 +97,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - TTFT: 2000ms - TPOT: 200ms - **Equal Issue 
mode**: True -- **Accuracy Variants**: 99% and 99.9% of reference model accuracy -- **Precision**: fp32 +- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy - **Submission Category**: Data Center ### 5.2. Text Summarization @@ -108,6 +107,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 13,368 - **QSL Size**: 13,368 - **Number of Parameters**: 6 Billion +- **FLOPs**: TBD - **Reference Model Accuracy**: - Rouge1: 42.9865 - Rouge2: 20.1235 @@ -115,8 +115,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - Gen_len: 4,016,878 - **Server Scenario Latency Constraint**: 20s - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and 99.9% of reference model accuracy -- **Precision**: fp32 +- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy - **Submission Category**: Data Center, Edge ### 5.3. Mixed Tasks (Question Answering, Math, and Code Generation) @@ -129,6 +128,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 15,000 - **QSL Size**: 15,000 - **Number of Parameters**: 47 Billion +- **FLOPs**: TBD - **Reference Model Accuracy**: - Rouge1: 45.4911 - Rouge2: 23.2829 @@ -140,8 +140,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - TTFT: 2000ms - TPOT: 200ms - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and of reference model -- **Precision**: fp16 +- **Accuracy Variants**: 99% and of fp16 reference model - **Submission Category**: Data Center --- @@ -152,11 +151,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 204,800 - **QSL Size**: 204,800 - **Number of Parameters**: TBD +- **FLOPs**: TBD - **Reference Model Accuracy**: AUC = 80.31% - **Server Scenario Latency Constraint**: 60ms - **Equal Issue mode**: False -- **Accuracy Variants**: 99% and 99.9% of reference model accuracy -- **Precision**: fp32 +- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy - **Submission Category**: Data Center --- From 5ba36a9a9ef232c3de92c7bcecab721261588c61 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Mon, 19 Aug 2024 15:20:12 +0530 Subject: [PATCH 09/27] updated non cm based benchmark --- docs/submission/index.md | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 6f6d82a39..9e6b66044 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -10,11 +10,10 @@ hide: cm find cache --tags=get,mlperf,inference,results,dir | xargs tree ``` === "Non CM based benchmark" - If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, make sure that the result directory is structured in the following way. + If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. ``` └── SUT_Name ├── cm_sut_info.json - ├── model_mapping.json └── model └── Scenario └── loadgen_Mode @@ -31,14 +30,20 @@ hide: - `device` - `framework` - `run_config` - - The `model_mapping.json` is used to map the custom model full name to the official model name. 
The format of json file is:
- ```
- {
- "custom_model_name_for_model1":"official_model_name_for_model1",
- "custom_model_name_for_model2":"official_model_name_for_model2",
-
- }
- ```
+
+
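As a quick illustration of the layout described above, here is a minimal sketch that creates the SUT folder skeleton and a `cm_sut_info.json` with the keys listed in the tip; the folder names, key values, and the helper function are placeholders, not an official reference.

```python
# Illustrative sketch: create the expected SUT folder skeleton and a
# cm_sut_info.json for a run done outside CM. Key names follow the tip above;
# all values below are placeholders, not an official reference.
import json
from pathlib import Path


def make_sut_skeleton(root, sut_name, model, scenario, mode):
    sut_dir = Path(root) / sut_name
    (sut_dir / model / scenario / mode).mkdir(parents=True, exist_ok=True)
    sut_info = {
        "system_name": sut_name,        # placeholder
        "implementation": "reference",  # placeholder
        "device": "cpu",                # placeholder
        "framework": "onnxruntime",     # placeholder
        "run_config": "default",        # placeholder
    }
    (sut_dir / "cm_sut_info.json").write_text(json.dumps(sut_info, indent=2))
    return sut_dir


# Example (placeholder values):
# make_sut_skeleton("results", "my_sut", "resnet50", "Offline", "performance")
```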
+ Click here if you are submitting custom model in open division + + * The `model_mapping.json` should be included inside the SUT folder which is used to map the custom model full name to the official model name. The format of json file is: + + ``` + { + "custom_model_name_for_model1":"official_model_name_for_model1", + "custom_model_name_for_model2":"official_model_name_for_model2", + + } + ``` +
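A minimal sketch of producing such a `model_mapping.json` is shown below; the mapping entries are placeholders for whatever custom models are being submitted, and the helper itself is only illustrative.

```python
# Illustrative sketch: write a model_mapping.json that maps custom model names
# to official model names, following the format shown above.
# The mapping entries below are placeholders.
import json
from pathlib import Path


def write_model_mapping(sut_dir, mapping):
    path = Path(sut_dir) / "model_mapping.json"
    path.write_text(json.dumps(mapping, indent=2))
    return path


# Example (placeholder names):
# write_model_mapping("results/my_sut", {"resnet50_pruned_custom": "resnet50"})
```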
Once all the results across all the models are ready you can use the following command to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). From a46164678667ca7ccd8764cd7c80fdd35751e596 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 20 Aug 2024 11:09:31 +0530 Subject: [PATCH 10/27] added info about hf password --- docs/benchmarks/language/get-llama2-70b-data.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index 917f4705e..bdbb97b87 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -28,4 +28,8 @@ Get the Official MLPerf LLAMA2-70b Model ``` cm run script --tags=get,ml-model,llama2-70b,_pytorch -j ``` + +!!! tip + + Downloading llama2-70B model from Hugging Face will prompt you to enter the Hugging Face username and password. Please note that the password required is the **access token** generated for your account. Additionally, ensure that your account has access to the llama2-70B model. From dd47e451e59005aaf90bdf6f4c0d56da614568fe Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 20 Aug 2024 14:14:13 +0530 Subject: [PATCH 11/27] added links to model and access tokens --- docs/benchmarks/language/get-llama2-70b-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/benchmarks/language/get-llama2-70b-data.md b/docs/benchmarks/language/get-llama2-70b-data.md index bdbb97b87..0214d95a5 100644 --- a/docs/benchmarks/language/get-llama2-70b-data.md +++ b/docs/benchmarks/language/get-llama2-70b-data.md @@ -31,5 +31,5 @@ Get the Official MLPerf LLAMA2-70b Model !!! tip - Downloading llama2-70B model from Hugging Face will prompt you to enter the Hugging Face username and password. Please note that the password required is the **access token** generated for your account. Additionally, ensure that your account has access to the llama2-70B model. + Downloading llama2-70B model from Hugging Face will prompt you to enter the Hugging Face username and password. Please note that the password required is the [**access token**](https://huggingface.co/settings/tokens) generated for your account. Additionally, ensure that your account has access to the [llama2-70B](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) model. From a1d66d4ef66fbf15de8ef0fc7270d2dd06d3e52f Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 20 Aug 2024 17:06:04 +0530 Subject: [PATCH 12/27] Updated reference results structuree tree --- docs/submission/index.md | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 9e6b66044..4c892379b 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -12,20 +12,37 @@ hide: === "Non CM based benchmark" If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. ``` - └── SUT_Name + └── System description ID ├── cm_sut_info.json - └── model + └── Benchmark └── Scenario - └── loadgen_Mode - ├── file-1 - ├── ... 
- └── file-n + ├── Performance + | └── run_x/#1 run for all scenarios + | ├── mlperf_log_summary.txt + | └── mlperf_log_detail.txt + ├── Accuracy + | ├── mlperf_log_summary.txt + | ├── mlperf_log_detail.txt + | ├── mlperf_log_accuracy.json + | └── accuracy.txt + └── Test_ID + ├── Performance + | └── run_x/#1 run for all scenarios + | ├── mlperf_log_summary.txt + | └── mlperf_log_detail.txt + ├── Accuracy + | ├── baseline_accuracy.txt + | ├── compliance_accuracy.txt + | ├── mlperf_log_accuracy.json + | └── accuracy.txt + ├── verify_performance.txt + └── verify_accuracy.txt #for TEST01 only ``` !!! tip - The `cm_sut_info.json` should contain the following keys - - `system_name` + - `hardware_name` - `implementation` - `device` - `framework` From c5ae6ed9ae6877a99efc657a64664e426ba9e6f2 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 20 Aug 2024 18:23:20 +0530 Subject: [PATCH 13/27] submission docs cleanup --- docs/submission/index.md | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/docs/submission/index.md b/docs/submission/index.md index 4c892379b..a75bc3259 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -12,8 +12,8 @@ hide: === "Non CM based benchmark" If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. ``` - └── System description ID - ├── cm_sut_info.json + └── System description ID(SUT Name) + ├── system_meta.json └── Benchmark └── Scenario ├── Performance @@ -25,7 +25,7 @@ hide: | ├── mlperf_log_detail.txt | ├── mlperf_log_accuracy.json | └── accuracy.txt - └── Test_ID + └── Compliance_Test_ID ├── Performance | └── run_x/#1 run for all scenarios | ├── mlperf_log_summary.txt @@ -38,18 +38,9 @@ hide: ├── verify_performance.txt └── verify_accuracy.txt #for TEST01 only ``` - - !!! tip - - - The `cm_sut_info.json` should contain the following keys - - `hardware_name` - - `implementation` - - `device` - - `framework` - - `run_config`
- Click here if you are submitting custom model in open division + Click here if you are submitting in open division * The `model_mapping.json` should be included inside the SUT folder which is used to map the custom model full name to the official model name. The format of json file is: From bc80f65edf70fc12c8ac1b0da98ebccf8b0db1a8 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 18:10:16 +0100 Subject: [PATCH 14/27] Some cleanups for benchmark info --- docs/index.md | 81 +++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/docs/index.md b/docs/index.md index 907c38ab2..b408e8212 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,13 +10,13 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Imagenet-2012 (224x224) Validation - **Dataset Size**: 50,000 - **QSL Size**: 1,024 -- **Number of Parameters**: 25.6 Million -- **FLOPs**: TBD +- **Number of Parameters**: 25.6 million +- **FLOPs**: 3.8 billion - **Reference Model Accuracy**: 76.46% ACC - **Server Scenario Latency Constraint**: 15ms - **Equal Issue mode**: False -- **Accuracy Variants**: 99% of fp32 reference model accuracy -- **Submission Category**: Data Center, Edge +- **High accuracy variant**: No +- **Submission Category**: Datacenter, Edge --- @@ -25,14 +25,14 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Subset of Coco2014 - **Dataset Size**: 5,000 - **QSL Size**: 5,000 -- **Number of Parameters**: 3.5 Billion -- **FLOPs**: TBD +- **Number of Parameters**: 3.5 billion +- **FLOPs**: 1.28 - 2.4 trillion - **Required Accuracy (Closed Division)**: - FID: 23.01085758 ≤ FID ≤ 23.95007626 - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 - **Equal Issue mode**: False -- **Accuracy Variants**: 98% of fp32 reference model accuracy -- **Submission Category**: Data Center, Edge +- **High accuracy variant**: No +- **Submission Category**: Datacenter, Edge --- @@ -42,27 +42,26 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset Size**: 24,781 - **QSL Size**: 64 - **Number of Parameters**: TBD -- **FLOPs**: TBD -- **Reference Model Accuracy**: 0.3755 mAP +- **Reference Model Accuracy (fp32) **: 0.3755 mAP - **Server Scenario Latency Constraint**: 100ms - **Equal Issue mode**: False -- **Accuracy Variants**: 99% of fp32 reference model accuracy -- **Submission Category**: Data Center, Edge +- **High accuracy variant**: Yes +- **Submission Category**: Datacenter, Edge --- ## 4. 
Medical Image Segmentation -### [3d-unet](benchmarks/medical_imaging/3d-unet.md) +### [3d-unet](benchmarks/medical_imaging/3d-unet.md) - **Dataset**: KiTS2019 - **Dataset Size**: 42 - **QSL Size**: 42 -- **Number of Parameters**: 19 Million -- **FLOPs**: TBD -- **Reference Model Accuracy**: 0.86330 Mean DICE Score +- **Number of Parameters**: 32.5 million +- **FLOPs**: 100-300 billion +- **Reference Model Accuracy (fp32) **: 0.86330 Mean DICE Score - **Server Scenario**: Not Applicable - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy -- **Submission Category**: Data Center, Edge +- **High accuracy variant**: Yes +- **Submission Category**: Datacenter, Edge --- @@ -74,21 +73,21 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Squad v1.1 (384 Sequence Length) - **Dataset Size**: 10,833 - **QSL Size**: 10,833 -- **Number of Parameters**: 340 Million -- **FLOPs**: TBD -- **Reference Model Accuracy**: F1 Score = 90.874% +- **Number of Parameters**: 340 million +- **FLOPs**: ~128 billion +- **Reference Model Accuracy (fp32) **: F1 Score = 90.874% - **Server Scenario Latency Constraint**: 130ms - **Equal Issue mode**: False -- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy -- **Submission Category**: Data Center, Edge +- **High accuracy variant**: yes +- **Submission Category**: Datacenter, Edge ### [LLAMA2-70B](benchmarks/language/llama2-70b.md) - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) - **Dataset Size**: 24,576 - **QSL Size**: 24,576 -- **Number of Parameters**: 70 Billion -- **FLOPs**: TBD -- **Reference Model Accuracy**: +- **Number of Parameters**: 70 billion +- **FLOPs**: ~500 trillion +- **Reference Model Accuracy (fp16) **: - Rouge1: 44.4312 - Rouge2: 22.0352 - RougeL: 28.6162 @@ -97,8 +96,8 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - TTFT: 2000ms - TPOT: 200ms - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy -- **Submission Category**: Data Center +- **High accuracy variant**: Yes +- **Submission Category**: Datacenter ### 5.2. Text Summarization @@ -106,17 +105,17 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: CNN Daily Mail v3.0.0 - **Dataset Size**: 13,368 - **QSL Size**: 13,368 -- **Number of Parameters**: 6 Billion -- **FLOPs**: TBD -- **Reference Model Accuracy**: +- **Number of Parameters**: 6 billion +- **FLOPs**: ~148 billion +- **Reference Model Accuracy (fp32) **: - Rouge1: 42.9865 - Rouge2: 20.1235 - RougeL: 29.9881 - Gen_len: 4,016,878 - **Server Scenario Latency Constraint**: 20s - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy -- **Submission Category**: Data Center, Edge +- **High accuracy variant**: Yes +- **Submission Category**: Datacenter, Edge ### 5.3. 
Mixed Tasks (Question Answering, Math, and Code Generation) @@ -127,9 +126,8 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - MBXP (5k samples of the validation split, max_seq_len=2048) - **Dataset Size**: 15,000 - **QSL Size**: 15,000 -- **Number of Parameters**: 47 Billion -- **FLOPs**: TBD -- **Reference Model Accuracy**: +- **Number of Parameters**: 47 billion +- **Reference Model Accuracy (fp16) **: - Rouge1: 45.4911 - Rouge2: 23.2829 - RougeL: 30.3615 @@ -140,8 +138,8 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - TTFT: 2000ms - TPOT: 200ms - **Equal Issue mode**: True -- **Accuracy Variants**: 99% and of fp16 reference model -- **Submission Category**: Data Center +- **High accuracy variant**: Yes +- **Submission Category**: Datacenter --- @@ -150,13 +148,12 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Dataset**: Synthetic Multihot Criteo - **Dataset Size**: 204,800 - **QSL Size**: 204,800 -- **Number of Parameters**: TBD -- **FLOPs**: TBD +- **Number of Parameters**: ~23 billion - **Reference Model Accuracy**: AUC = 80.31% - **Server Scenario Latency Constraint**: 60ms - **Equal Issue mode**: False -- **Accuracy Variants**: 99% and 99.9% of fp32 reference model accuracy -- **Submission Category**: Data Center +- **High accuracy variant**: Yes +- **Submission Category**: Datacenter --- From 65e63db60ddb8c99cee7b41375a1f064cc5eb7e2 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 18:14:59 +0100 Subject: [PATCH 15/27] Some cleanups for benchmark info --- docs/index.md | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/docs/index.md b/docs/index.md index b408e8212..4c8530197 100644 --- a/docs/index.md +++ b/docs/index.md @@ -88,10 +88,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Number of Parameters**: 70 billion - **FLOPs**: ~500 trillion - **Reference Model Accuracy (fp16) **: - - Rouge1: 44.4312 - - Rouge2: 22.0352 - - RougeL: 28.6162 - - Tokens_per_sample: 294.45 + - Rouge1: 44.4312, Rouge2: 22.0352, RougeL: 28.6162, Tokens_per_sample: 294.45 - **Server Scenario Latency Constraint**: - TTFT: 2000ms - TPOT: 200ms @@ -108,10 +105,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Number of Parameters**: 6 billion - **FLOPs**: ~148 billion - **Reference Model Accuracy (fp32) **: - - Rouge1: 42.9865 - - Rouge2: 20.1235 - - RougeL: 29.9881 - - Gen_len: 4,016,878 + - Rouge1: 42.9865, Rouge2: 20.1235, RougeL: 29.9881, Gen_len: 4,016,878 - **Server Scenario Latency Constraint**: 20s - **Equal Issue mode**: True - **High accuracy variant**: Yes @@ -128,9 +122,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **QSL Size**: 15,000 - **Number of Parameters**: 47 billion - **Reference Model Accuracy (fp16) **: - - Rouge1: 45.4911 - - Rouge2: 23.2829 - - RougeL: 30.3615 + - Rouge1: 45.4911, Rouge2: 23.2829, RougeL: 30.3615 - GSM8K Accuracy: 73.78% - MBXP Accuracy: 60.12% - Tokens_per_sample: 294.45 From ba9820d4f1152b4ee0975b118539ec21a472616f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 18:18:46 +0100 Subject: [PATCH 16/27] Some cleanups for benchmark info --- docs/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/index.md b/docs/index.md index 4c8530197..aed0438d7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -150,9 +150,9 @@ This document provides 
details on various [MLPerf Inference Benchmarks](index_gh --- ### Submission Categories -- **Datacenter Category**: All the benchmarks can participate. -- **Edge Category**: All benchmarks except DLRMv2, LLAMA2, and Mixtral-8x7B can participate. +- **Datacenter Category**: All the current inference benchmarks are applicable to the datacenter category. +- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, and Mixtral-8x7B are applicable to the edge category. ### High Accuracy Variants -- **Benchmarks**: `bert`, `llama2-70b`, `dlrm_v2`, and `3d-unet` +- **Benchmarks**: `bert`, `llama2-70b`, `gpt-j`, `dlrm_v2`, and `3d-unet` have a normal accuracy variant as well as a high accuracy variant. - **Requirement**: Must achieve at least 99.9% of the reference model accuracy, compared to the default 99% accuracy requirement. From 9ea1d14092b78a284494eaac025c63d94ee5ab96 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 23 Aug 2024 00:05:45 +0530 Subject: [PATCH 17/27] added generic stubs deepsparse --- main.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 4705ead83..73690b1ec 100644 --- a/main.py +++ b/main.py @@ -24,7 +24,7 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): elif model.lower() == "retinanet": frameworks = [ "Onnxruntime", "Pytorch" ] elif "bert" in model.lower(): - frameworks = [ "Pytorch" ] + frameworks = [ "Pytorch", "deepsparse" ] else: frameworks = [ "Pytorch" ] @@ -157,7 +157,28 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): run_suffix += f"{cur_space3} Please click here to see more options for the RUN command\n\n" run_suffix += f"{cur_space3}* Use `--division=closed` to do a closed division submission which includes compliance runs\n\n" run_suffix += f"{cur_space3}* Use `--rerun` to do a rerun even when a valid run exists\n" - run_suffix += f"{cur_space3}
\n" + run_suffix += f"{cur_space3}\n\n" + + if "bert" in model.lower() and framework == "deepsparse": + run_suffix += f"{cur_space3}
\n" + run_suffix += f"{cur_space3} Please click here for generic model stubs for bert deepsparse\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/obert-large/pytorch/huggingface/squad/pruned95_quant-none-vnni\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/mobilebert-none/pytorch/huggingface/squad/14layer_pruned50_quant-none-vnni\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/mobilebert-none/pytorch/huggingface/squad/base_quant-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/pruned95_obs_quant-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/mobilebert-none/pytorch/huggingface/squad/14layer_pruned50-none-vnni\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/obert-base/pytorch/huggingface/squad/pruned90-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/obert-large/pytorch/huggingface/squad/pruned97_quant-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/bert-base/pytorch/huggingface/squad/pruned90-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/bert-large/pytorch/huggingface/squad/pruned80_quant-none-vnni\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/obert-large/pytorch/huggingface/squad/pruned95-none-vnni\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/obert-large/pytorch/huggingface/squad/pruned97-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/bert-large/pytorch/huggingface/squad/base-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/obert-large/pytorch/huggingface/squad/base-none\n\n" + run_suffix += f"{cur_space3}* zoo:nlp/question_answering/mobilebert-none/pytorch/huggingface/squad/base-none\n" + run_suffix += f"{cur_space3}
\n" + + for scenario in scenarios: content += f"{cur_space3}=== \"{scenario}\"\n{cur_space4}###### {scenario}\n\n" @@ -287,7 +308,9 @@ def mlperf_inference_run_command(spaces, model, implementation, framework, categ docker_cmd_suffix = f" \\\n{pre_space} --docker --quiet" docker_cmd_suffix += f" \\\n{pre_space} --test_query_count={test_query_count}" - if "llama2-70b" in model: + if "bert" in model.lower() and framework == "deepsparse": + docker_cmd_suffix += f"\\\n{pre_space} --env.CM_MLPERF_NEURALMAGIC_MODEL_ZOO_STUB=zoo:nlp/question_answering/mobilebert-none/pytorch/huggingface/squad/base_quant-none" + if "llama2-70b" in model.lower(): if implementation == "nvidia": docker_cmd_suffix += f" \\\n{pre_space} --tp_size=2" docker_cmd_suffix += f" \\\n{pre_space} --nvidia_llama2_dataset_file_path=" @@ -295,7 +318,7 @@ def mlperf_inference_run_command(spaces, model, implementation, framework, categ docker_cmd_suffix += f" \\\n{pre_space} --api_server=http://localhost:8000" docker_cmd_suffix += f" \\\n{pre_space} --vllm_model_name=nm-testing/Llama-2-70b-chat-hf-FP8" - if "dlrm-v2" in model and implementation == "nvidia": + if "dlrm-v2" in model.lower() and implementation == "nvidia": docker_cmd_suffix += f" \\\n{pre_space} --criteo_day23_raw_data_path=" docker_setup_cmd = f"""\n @@ -317,7 +340,9 @@ def mlperf_inference_run_command(spaces, model, implementation, framework, categ if execution_mode == "test": cmd_suffix += f" \\\n {pre_space} --test_query_count={test_query_count}" - if "llama2-70b" in model: + if "bert" in model.lower() and framework == "deepsparse": + cmd_suffix += f"\\\n{pre_space} --env.CM_MLPERF_NEURALMAGIC_MODEL_ZOO_STUB=zoo:nlp/question_answering/mobilebert-none/pytorch/huggingface/squad/base_quant-none" + if "llama2-70b" in model.lower(): if implementation == "nvidia": cmd_suffix += f" \\\n{pre_space} --tp_size=" cmd_suffix += f" \\\n{pre_space} --nvidia_llama2_dataset_file_path=" From 63888bcf4f7398ee69a1a984e61fe2e26b7c6f95 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 19:44:51 +0100 Subject: [PATCH 18/27] Some cleanups for benchmark info --- docs/index.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/index.md b/docs/index.md index aed0438d7..289ea509a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,11 +1,11 @@ # MLPerf Inference Benchmarks ## Overview -This document provides details on various [MLPerf Inference Benchmarks](index_gh.md) categorized by tasks, models, and datasets. Each section lists the models performing similar tasks, with details on datasets, accuracy, and server latency constraints. +The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v4.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, accuracy, server latency constraints etc. --- -## 1. Image Classification +## Image Classification ### [ResNet50-v1.5](benchmarks/image_classification/resnet50.md) - **Dataset**: Imagenet-2012 (224x224) Validation - **Dataset Size**: 50,000 @@ -20,7 +20,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh --- -## 2. 
Text to Image +## Text to Image ### [Stable Diffusion](benchmarks/text_to_image/sdxl.md) - **Dataset**: Subset of Coco2014 - **Dataset Size**: 5,000 @@ -28,15 +28,15 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **Number of Parameters**: 3.5 billion - **FLOPs**: 1.28 - 2.4 trillion - **Required Accuracy (Closed Division)**: - - FID: 23.01085758 ≤ FID ≤ 23.95007626 - - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 + - FID: 23.01085758 ≤ FID ≤ 23.95007626 + - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 - **Equal Issue mode**: False - **High accuracy variant**: No - **Submission Category**: Datacenter, Edge --- -## 3. Object Detection +## Object Detection ### [Retinanet](benchmarks/object_detection/retinanet.md) - **Dataset**: OpenImages - **Dataset Size**: 24,781 @@ -50,7 +50,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh --- -## 4. Medical Image Segmentation +## Medical Image Segmentation ### [3d-unet](benchmarks/medical_imaging/3d-unet.md) - **Dataset**: KiTS2019 - **Dataset Size**: 42 @@ -65,11 +65,11 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh --- -## 5. Language Tasks +## Language Tasks -### 5.1. Question Answering +### Question Answering -### [Bert-Large](benchmarks/language/bert.md) +#### [Bert-Large](benchmarks/language/bert.md) - **Dataset**: Squad v1.1 (384 Sequence Length) - **Dataset Size**: 10,833 - **QSL Size**: 10,833 @@ -81,7 +81,7 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **High accuracy variant**: yes - **Submission Category**: Datacenter, Edge -### [LLAMA2-70B](benchmarks/language/llama2-70b.md) +#### [LLAMA2-70B](benchmarks/language/llama2-70b.md) - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) - **Dataset Size**: 24,576 - **QSL Size**: 24,576 @@ -96,9 +96,9 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **High accuracy variant**: Yes - **Submission Category**: Datacenter -### 5.2. Text Summarization +### Text Summarization -### [GPT-J](benchmarks/language/gpt-j.md) +#### [GPT-J](benchmarks/language/gpt-j.md) - **Dataset**: CNN Daily Mail v3.0.0 - **Dataset Size**: 13,368 - **QSL Size**: 13,368 @@ -111,13 +111,13 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh - **High accuracy variant**: Yes - **Submission Category**: Datacenter, Edge -### 5.3. Mixed Tasks (Question Answering, Math, and Code Generation) +### Mixed Tasks (Question Answering, Math, and Code Generation) -### [Mixtral-8x7B](benchmarks/language/mixtral-8x7b.md) +#### [Mixtral-8x7B](benchmarks/language/mixtral-8x7b.md) - **Datasets**: - - OpenORCA (5k samples of GPT-4 split, max_seq_len=2048) - - GSM8K (5k samples of the validation split, max_seq_len=2048) - - MBXP (5k samples of the validation split, max_seq_len=2048) + - OpenORCA (5k samples of GPT-4 split, max_seq_len=2048) + - GSM8K (5k samples of the validation split, max_seq_len=2048) + - MBXP (5k samples of the validation split, max_seq_len=2048) - **Dataset Size**: 15,000 - **QSL Size**: 15,000 - **Number of Parameters**: 47 billion @@ -135,8 +135,8 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh --- -## 6. 
Recommendation -### [DLRMv2](benchmarks/recommendation/dlrm-v2.md) +## Recommendation +### [DLRM_v2](benchmarks/recommendation/dlrm-v2.md) - **Dataset**: Synthetic Multihot Criteo - **Dataset Size**: 204,800 - **QSL Size**: 204,800 @@ -149,10 +149,10 @@ This document provides details on various [MLPerf Inference Benchmarks](index_gh --- -### Submission Categories +## Submission Categories - **Datacenter Category**: All the current inference benchmarks are applicable to the datacenter category. - **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, and Mixtral-8x7B are applicable to the edge category. -### High Accuracy Variants +## High Accuracy Variants - **Benchmarks**: `bert`, `llama2-70b`, `gpt-j`, `dlrm_v2`, and `3d-unet` have a normal accuracy variant as well as a high accuracy variant. - **Requirement**: Must achieve at least 99.9% of the reference model accuracy, compared to the default 99% accuracy requirement. From b956c6d09697ce8e4174219445b47bb6b9e493ee Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 19:51:37 +0100 Subject: [PATCH 19/27] Some cleanups for benchmark info --- docs/index.md | 59 +++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/docs/index.md b/docs/index.md index 289ea509a..580a4336d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,8 +8,8 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe ## Image Classification ### [ResNet50-v1.5](benchmarks/image_classification/resnet50.md) - **Dataset**: Imagenet-2012 (224x224) Validation - - **Dataset Size**: 50,000 - - **QSL Size**: 1,024 + - **Dataset Size**: 50,000 + - **QSL Size**: 1,024 - **Number of Parameters**: 25.6 million - **FLOPs**: 3.8 billion - **Reference Model Accuracy**: 76.46% ACC @@ -23,8 +23,8 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe ## Text to Image ### [Stable Diffusion](benchmarks/text_to_image/sdxl.md) - **Dataset**: Subset of Coco2014 - - **Dataset Size**: 5,000 - - **QSL Size**: 5,000 + - **Dataset Size**: 5,000 + - **QSL Size**: 5,000 - **Number of Parameters**: 3.5 billion - **FLOPs**: 1.28 - 2.4 trillion - **Required Accuracy (Closed Division)**: @@ -39,8 +39,8 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe ## Object Detection ### [Retinanet](benchmarks/object_detection/retinanet.md) - **Dataset**: OpenImages - - **Dataset Size**: 24,781 - - **QSL Size**: 64 + - **Dataset Size**: 24,781 + - **QSL Size**: 64 - **Number of Parameters**: TBD - **Reference Model Accuracy (fp32) **: 0.3755 mAP - **Server Scenario Latency Constraint**: 100ms @@ -53,8 +53,8 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe ## Medical Image Segmentation ### [3d-unet](benchmarks/medical_imaging/3d-unet.md) - **Dataset**: KiTS2019 - - **Dataset Size**: 42 - - **QSL Size**: 42 + - **Dataset Size**: 42 + - **QSL Size**: 42 - **Number of Parameters**: 32.5 million - **FLOPs**: 100-300 billion - **Reference Model Accuracy (fp32) **: 0.86330 Mean DICE Score @@ -71,8 +71,8 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe #### [Bert-Large](benchmarks/language/bert.md) - **Dataset**: Squad v1.1 (384 Sequence Length) - - **Dataset Size**: 10,833 - - **QSL Size**: 10,833 + - **Dataset Size**: 10,833 + - **QSL Size**: 10,833 - **Number of Parameters**: 340 million - **FLOPs**: ~128 billion - **Reference Model Accuracy (fp32) **: F1 Score = 90.874% @@ -83,15 
+83,18 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe #### [LLAMA2-70B](benchmarks/language/llama2-70b.md) - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) - - **Dataset Size**: 24,576 - - **QSL Size**: 24,576 + - **Dataset Size**: 24,576 + - **QSL Size**: 24,576 - **Number of Parameters**: 70 billion - **FLOPs**: ~500 trillion - **Reference Model Accuracy (fp16) **: - - Rouge1: 44.4312, Rouge2: 22.0352, RougeL: 28.6162, Tokens_per_sample: 294.45 + - Rouge1: 44.4312 + - Rouge2: 22.0352 + - RougeL: 28.6162 + - Tokens_per_sample: 294.45 - **Server Scenario Latency Constraint**: - - TTFT: 2000ms - - TPOT: 200ms + - TTFT: 2000ms + - TPOT: 200ms - **Equal Issue mode**: True - **High accuracy variant**: Yes - **Submission Category**: Datacenter @@ -100,12 +103,15 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe #### [GPT-J](benchmarks/language/gpt-j.md) - **Dataset**: CNN Daily Mail v3.0.0 - - **Dataset Size**: 13,368 - - **QSL Size**: 13,368 + - **Dataset Size**: 13,368 + - **QSL Size**: 13,368 - **Number of Parameters**: 6 billion - **FLOPs**: ~148 billion - **Reference Model Accuracy (fp32) **: - - Rouge1: 42.9865, Rouge2: 20.1235, RougeL: 29.9881, Gen_len: 4,016,878 + - Rouge1: 42.9865 + - Rouge2: 20.1235 + - RougeL: 29.9881 + - Gen_len: 4,016,878 - **Server Scenario Latency Constraint**: 20s - **Equal Issue mode**: True - **High accuracy variant**: Yes @@ -118,17 +124,20 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - OpenORCA (5k samples of GPT-4 split, max_seq_len=2048) - GSM8K (5k samples of the validation split, max_seq_len=2048) - MBXP (5k samples of the validation split, max_seq_len=2048) - - **Dataset Size**: 15,000 - - **QSL Size**: 15,000 + - **Dataset Size**: 15,000 + - **QSL Size**: 15,000 - **Number of Parameters**: 47 billion - **Reference Model Accuracy (fp16) **: - - Rouge1: 45.4911, Rouge2: 23.2829, RougeL: 30.3615 - - GSM8K Accuracy: 73.78% - - MBXP Accuracy: 60.12% + - OpenORCA + - Rouge1: 45.4911 + - Rouge2: 23.2829 + - RougeL: 30.3615 + - GSM8K Accuracy: 73.78% + - MBXP Accuracy: 60.12% - Tokens_per_sample: 294.45 - **Server Scenario Latency Constraint**: - - TTFT: 2000ms - - TPOT: 200ms + - TTFT: 2000ms + - TPOT: 200ms - **Equal Issue mode**: True - **High accuracy variant**: Yes - **Submission Category**: Datacenter From 2c493340316b6f2cd30d4840bbd3093d5a674a79 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 19:52:15 +0100 Subject: [PATCH 20/27] Some cleanups for benchmark info --- docs/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 580a4336d..f5473afa7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -147,8 +147,8 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe ## Recommendation ### [DLRM_v2](benchmarks/recommendation/dlrm-v2.md) - **Dataset**: Synthetic Multihot Criteo - - **Dataset Size**: 204,800 - - **QSL Size**: 204,800 + - **Dataset Size**: 204,800 + - **QSL Size**: 204,800 - **Number of Parameters**: ~23 billion - **Reference Model Accuracy**: AUC = 80.31% - **Server Scenario Latency Constraint**: 60ms From 13db0f80be51bfca95dd5198773a16eae2ca1833 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 22 Aug 2024 20:47:34 +0100 Subject: [PATCH 21/27] Some cleanups for benchmark info (FID and CLIP data added) --- docs/index.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/index.md 
b/docs/index.md index f5473afa7..11f2a52c2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ # MLPerf Inference Benchmarks ## Overview -The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v4.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, accuracy, server latency constraints etc. +The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v4.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc. --- @@ -27,9 +27,10 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **QSL Size**: 5,000 - **Number of Parameters**: 3.5 billion - **FLOPs**: 1.28 - 2.4 trillion +- **Reference Model Accuracy (fp32)**: CLIP: 31.74981837, FID: 23.48046692 - **Required Accuracy (Closed Division)**: - - FID: 23.01085758 ≤ FID ≤ 23.95007626 - - CLIP: 32.68631873 ≤ CLIP ≤ 31.81331801 + - CLIP: 31.68631873 ≤ CLIP ≤ 31.81331801 (within 0.2% of the reference model CLIP score) + - FID: 23.01085758 ≤ FID ≤ 23.95007626 (within 2% of the reference model FID score) - **Equal Issue mode**: False - **High accuracy variant**: No - **Submission Category**: Datacenter, Edge @@ -87,7 +88,7 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **QSL Size**: 24,576 - **Number of Parameters**: 70 billion - **FLOPs**: ~500 trillion -- **Reference Model Accuracy (fp16) **: +- **Reference Model Accuracy (fp32) **: - Rouge1: 44.4312 - Rouge2: 22.0352 - RougeL: 28.6162 From 4eefc940af31f2e4353448725d115a5829235f8b Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 23 Aug 2024 16:54:39 +0530 Subject: [PATCH 22/27] typo fix for bert deepsparse framework --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 73690b1ec..958f6d90b 100644 --- a/main.py +++ b/main.py @@ -24,7 +24,7 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): elif model.lower() == "retinanet": frameworks = [ "Onnxruntime", "Pytorch" ] elif "bert" in model.lower(): - frameworks = [ "Pytorch", "deepsparse" ] + frameworks = [ "Pytorch", "Deepsparse" ] else: frameworks = [ "Pytorch" ] From e6abadd56d29005a622cbd5428f97ff10aa8bbd4 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Fri, 23 Aug 2024 18:34:22 +0530 Subject: [PATCH 23/27] added min system requirements for models --- main.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/main.py b/main.py index 958f6d90b..d8f4ba847 100644 --- a/main.py +++ b/main.py @@ -104,6 +104,9 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): content += f"{cur_space1}=== \"{device}\"\n" content += f"{cur_space2}##### {device} device\n\n" + # minimum system requirements + content += get_min_system_requirements(cur_space2, model, implementation, device) + # to select the execution environments(currently Docker and Native) for execution_env in execution_envs: if (device == "ROCm" or implementation == "qualcomm") and execution_env == "Docker": @@ -213,6 +216,55 @@ def get_test_query_count(model, implementation, device, num_devices=1): p_range *= num_devices return p_range + + def get_min_system_requirements(spaces, model, implementation, device): + model = model.lower() + min_sys_req_content = "" + min_sys_req_content += f"{spaces}
\n" + min_sys_req_content += f"{spaces}Please click here to see the minimum system requirements for running the benchmark\n\n" + # device memory + if device.lower() == "cuda" and (implementation.lower() == "nvidia" or implementation.lower() == "reference"): + if implementation.lower() == "nvidia": + if "dlrm" in model: + device_memory = "24GB" + elif "llama2-70b" in model or "mixtral" in model: + device_memory = "80GB" + elif "sdxl" in model or "gptj" in model: + device_memory = "16GB" + else: + device_memory = "8GB" + elif implementation.lower() == "reference": + if "dlrm" in model: + device_memory = "2x80GB" + elif "llama2-70b" in model: + device_memory = "8x80GB" + elif "mixtral" in model: + device_memory = "4x80GB" + elif "sdxl" in model: + device_memory = "24GB(fp32), 16GB(fp16)" + elif "gptj" in model: + device_memory = "80GB(fp32). 40GB(fp16)" + else: + device_memory = "8GB" + min_sys_req_content += f"{spaces}* **Device Memory**: {device_memory}\n\n" + # disk space + if "dlrm" in model: + disk_space = "500GB" + elif "llama2-70b" in model: + disk_space = "700GB" + elif "mixtral" in model: + disk_space = "100GB" + elif "retinanet" in model: + disk_space = "200GB" + else: + disk_space = "50GB" + min_sys_req_content += f"{spaces}* **Disk Space**: {disk_space}\n\n" + # System memory + if "dlrm" in model: + system_memory = "512GB" + min_sys_req_content += f"{spaces}* **System Memory(RAM+SWAP)**: {system_memory}\n\n" + min_sys_req_content += f"{spaces}
\n" + return min_sys_req_content def get_readme_prefix(spaces, model, implementation): readme_prefix = "" From d994a863d793114d2018082d4208319eabe5e39f Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 3 Sep 2024 15:44:18 +0530 Subject: [PATCH 24/27] fixed code version --- main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) mode change 100644 => 100755 main.py diff --git a/main.py b/main.py old mode 100644 new mode 100755 index d8f4ba847..367bf7178 --- a/main.py +++ b/main.py @@ -12,7 +12,7 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): content="" scenarios = [] execution_envs = ["Docker","Native"] - code_version="r4.1" + code_version="r4.1-dev" if model == "rnnt": code_version="r4.0" @@ -140,7 +140,7 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): content += f"{cur_space3}The above command should get you to an interactive shell inside the docker container and do a quick test run for the Offline scenario. Once inside the docker container please do the below commands to do the accuracy + performance runs for each scenario.\n\n" content += f"{cur_space3}
\n" content += f"{cur_space3} Please click here to see more options for the docker launch \n\n" - content += f"{cur_space3}* `--docker_cm_repo `: to use a custom fork of cm4mlops repository inside the docker image\n\n" + content += f"{cur_space3}* `--docker_cm_repo=`: to use a custom fork of cm4mlops repository inside the docker image\n\n" content += f"{cur_space3}* `--docker_cache=no`: to not use docker cache during the image build\n" if device.lower() not in [ "cuda" ]: @@ -337,7 +337,7 @@ def get_run_cmd_extra(f_pre_space, model, implementation, device, scenario, scen return extra_content @env.macro - def mlperf_inference_run_command(spaces, model, implementation, framework, category, scenario, device="cpu", execution_mode="test", test_query_count="20", docker=False, scenarios = [], code_version="r4.1"): + def mlperf_inference_run_command(spaces, model, implementation, framework, category, scenario, device="cpu", execution_mode="test", test_query_count="20", docker=False, scenarios = [], code_version="r4.1-dev"): pre_space = "" for i in range(1,spaces): pre_space = pre_space + " " From fd8945ce66f69b15291e39f63d0ba5feb1550380 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 3 Sep 2024 16:31:49 +0530 Subject: [PATCH 25/27] changes for displaying reference and intel implementation tip --- docs/benchmarks/language/bert.md | 9 +-------- docs/benchmarks/language/gpt-j.md | 8 +------- docs/benchmarks/language/llama2-70b.md | 6 ------ docs/benchmarks/language/mixtral-8x7b.md | 1 - docs/benchmarks/medical_imaging/3d-unet.md | 6 ------ docs/benchmarks/recommendation/dlrm-v2.md | 8 +------- main.py | 12 ++++++++++++ 7 files changed, 15 insertions(+), 35 deletions(-) diff --git a/docs/benchmarks/language/bert.md b/docs/benchmarks/language/bert.md index cb9a955f1..57b1cf224 100644 --- a/docs/benchmarks/language/bert.md +++ b/docs/benchmarks/language/bert.md @@ -8,34 +8,27 @@ hide: === "MLCommons-Python" ## MLPerf Reference Implementation in Python - BERT-99 {{ mlperf_inference_implementation_readme (4, "bert-99", "reference") }} - BERT-99.9 {{ mlperf_inference_implementation_readme (4, "bert-99.9", "reference") }} === "Nvidia" ## Nvidia MLPerf Implementation - BERT-99 {{ mlperf_inference_implementation_readme (4, "bert-99", "nvidia") }} - BERT-99.9 {{ mlperf_inference_implementation_readme (4, "bert-99.9", "nvidia") }} === "Intel" ## Intel MLPerf Implementation - BERT-99 + {{ mlperf_inference_implementation_readme (4, "bert-99", "intel") }} - BERT-99.9 {{ mlperf_inference_implementation_readme (4, "bert-99.9", "intel") }} === "Qualcomm" ## Qualcomm AI100 MLPerf Implementation - BERT-99 {{ mlperf_inference_implementation_readme (4, "bert-99", "qualcomm") }} - BERT-99.9 {{ mlperf_inference_implementation_readme (4, "bert-99.9", "qualcomm") }} diff --git a/docs/benchmarks/language/gpt-j.md b/docs/benchmarks/language/gpt-j.md index 125c87ef8..4dcb3d70e 100644 --- a/docs/benchmarks/language/gpt-j.md +++ b/docs/benchmarks/language/gpt-j.md @@ -8,29 +8,24 @@ hide: === "MLCommons-Python" ## MLPerf Reference Implementation in Python - - GPT-J-99 + {{ mlperf_inference_implementation_readme (4, "gptj-99", "reference") }} - GPTJ-99.9 {{ mlperf_inference_implementation_readme (4, "gptj-99.9", "reference") }} === "Nvidia" ## Nvidia MLPerf Implementation - GPTJ-99 {{ mlperf_inference_implementation_readme (4, "gptj-99", "nvidia") }} - GPTJ-99.9 {{ mlperf_inference_implementation_readme (4, "gptj-99.9", "nvidia") }} === "Intel" ## Intel MLPerf Implementation - GPTJ-99 {{ mlperf_inference_implementation_readme 
(4, "gptj-99", "intel") }} @@ -38,7 +33,6 @@ hide: === "Qualcomm" ## Qualcomm AI100 MLPerf Implementation - GPTJ-99 {{ mlperf_inference_implementation_readme (4, "gptj-99", "qualcomm") }} diff --git a/docs/benchmarks/language/llama2-70b.md b/docs/benchmarks/language/llama2-70b.md index 87b637b42..0d9a0504d 100644 --- a/docs/benchmarks/language/llama2-70b.md +++ b/docs/benchmarks/language/llama2-70b.md @@ -9,26 +9,20 @@ hide: === "MLCommons-Python" ## MLPerf Reference Implementation in Python - LLAMA2-70b-99 {{ mlperf_inference_implementation_readme (4, "llama2-70b-99", "reference") }} - LLAMA2-70b-99.9 {{ mlperf_inference_implementation_readme (4, "llama2-70b-99.9", "reference") }} === "Nvidia" ## Nvidia MLPerf Implementation - LLAMA2-70b-99 {{ mlperf_inference_implementation_readme (4, "llama2-70b-99", "nvidia") }} - LLAMA2-70b-99.9 {{ mlperf_inference_implementation_readme (4, "llama2-70b-99.9", "nvidia") }} === "Neural Magic" ## Neural Magic MLPerf Implementation - LLAMA2-70b-99 {{ mlperf_inference_implementation_readme (4, "llama2-70b-99", "neuralmagic") }} - LLAMA2-70b-99.9 {{ mlperf_inference_implementation_readme (4, "llama2-70b-99.9", "neuralmagic") }} \ No newline at end of file diff --git a/docs/benchmarks/language/mixtral-8x7b.md b/docs/benchmarks/language/mixtral-8x7b.md index f4e84e8cf..9f3bf2992 100644 --- a/docs/benchmarks/language/mixtral-8x7b.md +++ b/docs/benchmarks/language/mixtral-8x7b.md @@ -6,5 +6,4 @@ hide: === "MLCommons-Python" ## MLPerf Reference Implementation in Python - MIXTRAL-8x7b {{ mlperf_inference_implementation_readme (4, "mixtral-8x7b", "reference") }} \ No newline at end of file diff --git a/docs/benchmarks/medical_imaging/3d-unet.md b/docs/benchmarks/medical_imaging/3d-unet.md index 8d5017bf5..01a54c63e 100644 --- a/docs/benchmarks/medical_imaging/3d-unet.md +++ b/docs/benchmarks/medical_imaging/3d-unet.md @@ -9,30 +9,24 @@ hide: === "MLCommons-Python" ## MLPerf Reference Implementation in Python - 3d-unet-99 {{ mlperf_inference_implementation_readme (4, "3d-unet-99", "reference") }} - 3d-unet-99.9 {{ mlperf_inference_implementation_readme (4, "3d-unet-99.9", "reference") }} === "Nvidia" ## Nvidia MLPerf Implementation - 3d-unet-99 {{ mlperf_inference_implementation_readme (4, "3d-unet-99", "nvidia") }} - 3d-unet-99.9 {{ mlperf_inference_implementation_readme (4, "3d-unet-99.9", "nvidia") }} === "Intel" ## Intel MLPerf Implementation - 3d-unet-99 {{ mlperf_inference_implementation_readme (4, "3d-unet-99", "intel") }} - 3d-unet-99.9 {{ mlperf_inference_implementation_readme (4, "3d-unet-99.9", "intel") }} diff --git a/docs/benchmarks/recommendation/dlrm-v2.md b/docs/benchmarks/recommendation/dlrm-v2.md index d1b41deb1..ce3081077 100644 --- a/docs/benchmarks/recommendation/dlrm-v2.md +++ b/docs/benchmarks/recommendation/dlrm-v2.md @@ -10,26 +10,20 @@ hide: === "MLCommons-Python" ## MLPerf Reference Implementation in Python - DLRM-v2-99 {{ mlperf_inference_implementation_readme (4, "dlrm-v2-99", "reference") }} - DLRM-v2-99.9 {{ mlperf_inference_implementation_readme (4, "dlrm-v2-99.9", "reference") }} === "Nvidia" ## Nvidia MLPerf Implementation - - DLRM-v2-99 + {{ mlperf_inference_implementation_readme (4, "dlrm-v2-99", "nvidia") }} - DLRM-v2-99.9 {{ mlperf_inference_implementation_readme (4, "dlrm-v2-99.9", "nvidia") }} === "Intel" ## Intel MLPerf Implementation - DLRM-v2-99 {{ mlperf_inference_implementation_readme (4, "dlrm-v2-99", "intel") }} - DLRM-v2-99.9 {{ mlperf_inference_implementation_readme (4, "dlrm-v2-99.9", "intel") }} \ No newline at end 
of file diff --git a/main.py b/main.py index 367bf7178..19f2793d9 100755 --- a/main.py +++ b/main.py @@ -18,6 +18,11 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): code_version="r4.0" if implementation == "reference": + # Tip + if "99.9" not in model: + content += f"\n{pre_space}!!! tip\n\n" + content += f"{pre_space} - MLCommons reference implementations are only meant to provide a rules compliant reference implementation for the submitters and in most cases are not best performing. If you want to benchmark any system, it is advisable to use the vendor MLPerf implementation for that system like Nvidia, Intel etc.\n\n" + devices = [ "CPU", "CUDA", "ROCm" ] if model.lower() == "resnet50": frameworks = [ "Onnxruntime", "Tensorflow", "Deepsparse" ] @@ -39,6 +44,11 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): frameworks = [ "pytorch" ] elif implementation == "intel": + # Tip + if "99.9" not in model: + content += f"\n{pre_space}!!! tip\n\n" + content += f"{pre_space} - Intel MLPerf inference implementation is available only for datacenter category and has been tested only on a limited number of systems. Most of the benchmarks using Intel implementation require at least Intel Sapphire Rapids or higher CPU generation.\n\n" + if model not in [ "bert-99", "bert-99.9", "gptj-99", "gptj-99.9", "resnet50", "retinanet", "3d-unet-99", "3d-unet-99.9", "dlrm-v2-99", "dlrm-v2-99.9", "sdxl" ]: return pre_space+" WIP" if model in [ "bert-99", "bert-99.9", "retinanet", "3d-unet-99", "3d-unet-99.9" ]: @@ -72,6 +82,8 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): else: categories = [ "Edge", "Datacenter" ] + # model name + content += f"{pre_space}{model.upper()}\n\n" for category in categories: if category == "Edge" and not scenarios: scenarios = [ "Offline", "SingleStream" ] From 8815065200982f4786ab957bcae53bfd0108a950 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 3 Sep 2024 16:46:36 +0530 Subject: [PATCH 26/27] added reference to installation page --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index 19f2793d9..77f10560d 100755 --- a/main.py +++ b/main.py @@ -127,6 +127,8 @@ def mlperf_inference_implementation_readme(spaces, model, implementation): continue # Nvidia implementation only supports execution through docker content += f"{cur_space2}=== \"{execution_env}\"\n" content += f"{cur_space3}###### {execution_env} Environment\n\n" + # ref to cm installation + content += f"{cur_space3}Please refer to the [installation page](../../install/index.md) to install CM for running the automated benchmark commands.\n\n" test_query_count=get_test_query_count(model, implementation, device) if "99.9" not in model: #not showing docker command as it is already done for the 99% variant From 8085e8b150829c73516545f901330fc7a3019b24 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 3 Sep 2024 18:33:06 +0530 Subject: [PATCH 27/27] updated neural magic documentation --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index 77f10560d..c9e3e1b56 100755 --- a/main.py +++ b/main.py @@ -383,6 +383,7 @@ def mlperf_inference_run_command(spaces, model, implementation, framework, categ elif implementation == "neuralmagic": docker_cmd_suffix += f" \\\n{pre_space} --api_server=http://localhost:8000" docker_cmd_suffix += f" \\\n{pre_space} --vllm_model_name=nm-testing/Llama-2-70b-chat-hf-FP8" + docker_cmd_suffix += f" \\\n{pre_space} 
--adr.mlperf-implementation.tags=_repo.https://github.com/neuralmagic/inference,_branch.vllm"
 
     if "dlrm-v2" in model.lower() and implementation == "nvidia":
         docker_cmd_suffix += f" \\\n{pre_space} --criteo_day23_raw_data_path=
@@ -415,6 +416,7 @@ def mlperf_inference_run_command(spaces, model, implementation, framework, categ
     elif implementation == "neuralmagic":
         cmd_suffix += f" \\\n{pre_space} --api_server=http://localhost:8000"
         cmd_suffix += f" \\\n{pre_space} --vllm_model_name=nm-testing/Llama-2-70b-chat-hf-FP8"
+        cmd_suffix += f" \\\n{pre_space} --adr.mlperf-implementation.tags=_repo.https://github.com/neuralmagic/inference,_branch.vllm"
 
     if "dlrm-v2" in model and implementation == "nvidia":
         cmd_suffix += f" \\\n{pre_space} --criteo_day23_raw_data_path="
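
For context, the Neural Magic path patched above ends up rendering a `cm run script` command roughly like the sketch below. This is only an illustration assembled from the flags visible in these hunks; the `--tags` prefix, scenario, device and test query count are assumptions based on the surrounding docs generator, and the server URL and model stub are simply the defaults shown in the diff.

```bash
cm run script --tags=run-mlperf,inference,_full,_r4.1-dev \
   --model=llama2-70b-99 \
   --implementation=neuralmagic \
   --framework=pytorch \
   --category=datacenter \
   --scenario=Offline \
   --execution_mode=test \
   --device=cuda \
   --quiet \
   --test_query_count=50 \
   --api_server=http://localhost:8000 \
   --vllm_model_name=nm-testing/Llama-2-70b-chat-hf-FP8 \
   --adr.mlperf-implementation.tags=_repo.https://github.com/neuralmagic/inference,_branch.vllm
```

The `--adr.mlperf-implementation.tags` flag added in this last patch is what points the automation at the neuralmagic fork's vllm branch instead of the upstream inference repository.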