Commit 8ebfec2

Move torch_ttnn compilation process away from individual models

Parent: 54e45e3

14 files changed (+219, -342 lines)

README.md

Lines changed: 32 additions & 9 deletions

````diff
@@ -10,15 +10,15 @@ The table below summarizes the results of running various ML models through our
 
 | Model | Run Success | Torch Ops Before (Unique Ops) | Torch Ops Remain (Unique Ops) | To/From Device Ops | Original Run Time (ms) | Compiled Run Time (ms) | Accuracy (%) |
 |:------------------------------------|:--------------|:--------------------------------|:--------------------------------|:---------------------|-------------------------:|:-------------------------|:---------------|
-| [Mnist (Eval)](tests/models/mnist) || 14 (8) | 5 (4) | 12 | 11.04 | N/A | N/A |
-| [Mnist (Train)](tests/models/mnist) || 14 (8) | 7 (5) | 14 | 18.01 | 2922.51 | 85.88 |
-| [ResNet18](tests/models/resnet) || 70 (9) | 42 (4) | 45 | 1772.4 | 8398.87 | 99.99 |
-| [Bloom](tests/models/bloom) || 1407 (29) | N/A | N/A | 5602.6 | N/A | N/A |
-| [YOLOS](tests/models/yolos) || 964 (28) | N/A | N/A | 209.04 | N/A | N/A |
-| [Llama](tests/models/llama) || 3 (3) | 1 (1) | 5 | 38255.4 | N/A | N/A |
-| [BERT](tests/models/bert) || 1393 (21) | 537 (4) | 1388 | 61919.4 | 52814.88 | 98.64 |
-| [Falcon](tests/models/falcon) || 3 (3) | 1 (1) | 5 | 35014.3 | N/A | N/A |
-| [GPT-2](tests/models/gpt2) || 748 (31) | N/A | N/A | 1033.47 | N/A | N/A |
+| [Mnist (Eval)](tests/models/mnist) || 14 (8) | 5 (4) | 16 | 36.12 | N/A | N/A |
+| [Mnist (Train)](tests/models/mnist) || 14 (8) | 7 (5) | 14 | 114.49 | 2742.8 | 81.75 |
+| [ResNet18](tests/models/resnet) || 70 (9) | 42 (4) | 47 | 2094.6 | 10950.18 | 99.99 |
+| [Bloom](tests/models/bloom) || 1407 (29) | N/A | N/A | 9127.68 | N/A | N/A |
+| [YOLOS](tests/models/yolos) || 964 (28) | N/A | N/A | 1353.22 | N/A | N/A |
+| [Llama](tests/models/llama) || 3 (3) | 1 (1) | 5 | 52926.3 | N/A | N/A |
+| [BERT](tests/models/bert) || 1393 (21) | 537 (4) | 1607 | 65342 | 61028.65 | 98.64 |
+| [Falcon](tests/models/falcon) || 3 (3) | 1 (1) | 5 | 47738.8 | N/A | N/A |
+| [GPT-2](tests/models/gpt2) || 748 (31) | N/A | N/A | 2287.61 | N/A | N/A |
 
 ### Explanation of Metrics
 
@@ -173,3 +173,26 @@ PYTHONPATH=${TT_METAL_HOME}:$(pwd) python3 tools/run_transformers.py --model "ph
 ```
 
 You can also substitute the backend with `torch_stat` to run a reference comparison.
+
+# Add a model test
+If you want to record run-time metrics for a model or test, include a Pytest fixture named `record_property` as a parameter and set the "model_name" key.
+If you also want to compile the model with the torch_ttnn backend, set the `torch_ttnn` key to a tuple in this order: `(model, test_inputs, outputs)`. The "model_name" key still needs to be set. See the example code snippet below. Currently, only `torch.nn.Module` models with a `forward` function are supported.
+```
+class Model(torch.nn.Module):
+    def forward(self, x):
+        # ...
+        return outputs
+
+def test_model_name(record_property):
+    # Should be set as early as possible
+    record_property("model_name", "Model Name")
+
+    model = Model()
+    # ...
+    outputs = model(test_input)
+    # outputs = model(**test_inputs)  # dictionary inputs are also supported
+    # ...
+
+    # Can be set once all three objects for the tuple are defined
+    record_property("torch_ttnn", (model, test_input, outputs))
+```
````

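These recorded properties are consumed by the new autouse fixture in `tests/conftest.py` (shown below), which pickles run-time metrics to `metrics/<model_name>/original-run_time_metrics.pickle` and, when a `torch_ttnn` tuple was recorded, to `metrics/<model_name>/compiled-run_time_metrics.pickle`. A minimal sketch for inspecting those files after a test run; the directory layout and the `success`, `run_time`, `model_path`, and `accuracy` keys are taken from the fixture, and "BERT" is just one of the names the tests register:

```
import pickle
from pathlib import Path

# Inspect the metrics pickled by the compile_and_run fixture.
# "BERT" matches record_property("model_name", "BERT") in test_bert.py.
metrics_dir = Path("metrics/BERT")
for stage in ("original", "compiled"):
    path = metrics_dir / f"{stage}-run_time_metrics.pickle"
    if path.exists():
        with open(path, "rb") as f:
            # e.g. {"success": True, "run_time": 61028.65, "accuracy": 98.64}
            print(stage, pickle.load(f))
```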
docs/README.md.in

Lines changed: 23 additions & 0 deletions

````diff
@@ -81,3 +81,26 @@ PYTHONPATH=${{TT_METAL_HOME}}:$(pwd) python3 tools/run_transformers.py --model "
 ```
 
 You can also substitute the backend with `torch_stat` to run a reference comparison.
+
+# Add a model test
+If you want to record run-time metrics for a model or test, include a Pytest fixture named `record_property` as a parameter and set the "model_name" key.
+If you also want to compile the model with the torch_ttnn backend, set the `torch_ttnn` key to a tuple in this order: `(model, test_inputs, outputs)`. The "model_name" key still needs to be set. See the example code snippet below. Currently, only `torch.nn.Module` models with a `forward` function are supported.
+```
+class Model(torch.nn.Module):
+    def forward(self, x):
+        # ...
+        return outputs
+
+def test_model_name(record_property):
+    # Should be set as early as possible
+    record_property("model_name", "Model Name")
+
+    model = Model()
+    # ...
+    outputs = model(test_input)
+    # outputs = model(**test_inputs)  # dictionary inputs are also supported
+    # ...
+
+    # Can be set once all three objects for the tuple are defined
+    record_property("torch_ttnn", (model, test_input, outputs))
+```
````

tests/conftest.py

Lines changed: 65 additions & 0 deletions

```diff
@@ -1,6 +1,13 @@
 import pytest
 import ttnn
 import torch
+import torch_ttnn
+import collections.abc  # collections.Mapping/Sequence were removed in Python 3.10
+from tests.utils import calculate_accuracy
+import time
+from pathlib import Path
+import os
+import pickle
 
 
 @pytest.fixture(scope="session")
@@ -15,3 +22,61 @@ def reset_torch_dynamo():
     # PyTorch caches models. Start a fresh compile for each parameter of the test case.
     torch._dynamo.reset()
     yield
+
+
+@pytest.fixture(autouse=True)
+def compile_and_run(device, reset_torch_dynamo, request):
+    try:
+        start = time.perf_counter() * 1000
+        yield
+        end = time.perf_counter() * 1000
+        runtime_metrics = {"success": True, "run_time": round(end - start, 2)}
+    except Exception as e:
+        runtime_metrics = {"success": False}
+        # model_name is not known yet here; fall back to the pytest node name
+        print(f"{request.node.name} original failed to run. Raised exception: {e}")
+        raise
+    finally:
+        record = dict(request.node.user_properties)
+        model_path = Path(request.node.location[0])
+        runtime_metrics["model_path"] = str(model_path.parent)
+        if "model_name" in record:
+            model_name = record["model_name"]
+            p = Path(f"metrics/{model_name}")
+            os.makedirs(p, exist_ok=True)
+
+            original_metrics_path = p / "original-run_time_metrics.pickle"
+            with open(original_metrics_path, "wb") as f:
+                pickle.dump(runtime_metrics, f)
+
+            if "torch_ttnn" in record:
+                model, inputs, outputs = record["torch_ttnn"]
+                try:
+                    # Check that the model implements a forward function
+                    assert "forward" in dir(model), f"forward() not implemented in {model_name}"
+                    # Compile model with ttnn backend
+                    option = torch_ttnn.TorchTtnnOption(
+                        device=device, gen_graphviz=True, metrics_path=model_name
+                    )
+                    m = torch.compile(model, backend=torch_ttnn.backend, options=option)
+
+                    start = time.perf_counter() * 1000
+                    if isinstance(inputs, collections.abc.Mapping):
+                        outputs_after = m(**inputs)
+                    elif isinstance(inputs, collections.abc.Sequence):
+                        outputs_after = m(*inputs)
+                    else:
+                        outputs_after = m(inputs)
+                    end = time.perf_counter() * 1000
+                    comp_runtime_metrics = {"success": True, "run_time": round(end - start, 2)}
+                    option._out_fx_graphs[0].print_tabular()
+                    accuracy = calculate_accuracy(outputs, outputs_after)
+                    if accuracy:
+                        comp_runtime_metrics["accuracy"] = accuracy
+                except Exception as e:
+                    comp_runtime_metrics = {"success": False}
+                    print(f"{model_name} compiled failed to run. Raised exception: {e}")
+                    raise
+                finally:
+                    compiled_metrics_path = p / "compiled-run_time_metrics.pickle"
+                    with open(compiled_metrics_path, "wb") as f:
+                        pickle.dump(comp_runtime_metrics, f)
```

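The fixture delegates output comparison to `calculate_accuracy` from `tests/utils`, whose implementation this commit does not include. Below is a hypothetical sketch of such a helper, assuming it extracts comparable tensors and reports a Pearson correlation coefficient as a percentage; both are assumptions, and the repository's actual helper may differ:

```
import torch

def calculate_accuracy(original, compiled):
    # Hypothetical stand-in for tests/utils.calculate_accuracy; the real
    # implementation is not part of this diff.
    # transformers ModelOutput objects carry their scores in .logits.
    t_orig = original.logits if hasattr(original, "logits") else original
    t_comp = compiled.logits if hasattr(compiled, "logits") else compiled
    if not (torch.is_tensor(t_orig) and torch.is_tensor(t_comp)):
        return None  # falsy: the fixture simply skips the accuracy key
    # Pearson correlation between the flattened outputs, as a percentage,
    # in line with the README's Accuracy (%) column.
    stacked = torch.stack([t_orig.flatten().float(), t_comp.flatten().float()])
    pcc = torch.corrcoef(stacked)[0, 1].item()
    return round(pcc * 100, 2)
```

Returning a falsy value for incomparable outputs matches the fixture's `if accuracy:` guard, which records the key only when a number came back.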
tests/models/bert/test_bert.py

Lines changed: 7 additions & 27 deletions

```diff
@@ -1,13 +1,12 @@
 import torch
-import torch_ttnn
-import pytest
-from torch_ttnn.metrics import RunTimeMetrics
 
 # Load model directly
 from transformers import AutoTokenizer, AutoModelForQuestionAnswering
 
 
-def test_bert(device):
+def test_bert(record_property):
+    record_property("model_name", "BERT")
+
     # Download model from cloud
     model_name = "phiyodr/bert-large-finetuned-squad2"
     tokenizer = AutoTokenizer.from_pretrained(
@@ -32,10 +31,9 @@ def test_bert(device):
         truncation=True,
     )
 
-    metrics_path = "BERT"
     # Run inference with the original model
     with torch.no_grad():
-        outputs_before = RunTimeMetrics(metrics_path, "original", lambda: m(**inputs))
+        outputs = m(**inputs)
 
     # Helper function to decode output to human-readable text
     def decode_output(outputs):
@@ -44,34 +42,16 @@ def decode_output(outputs):
         response_tokens = inputs.input_ids[0, response_start:response_end]
         return tokenizer.decode(response_tokens)
 
-    answer_before = decode_output(outputs_before)
-
-    # Compile model with ttnn backend
-    option = torch_ttnn.TorchTtnnOption(
-        device=device, gen_graphviz=True, metrics_path=metrics_path
-    )
-    m = torch.compile(m, backend=torch_ttnn.backend, options=option)
-
-    # Run inference with the compiled model
-    with torch.no_grad():
-        outputs_after = RunTimeMetrics(metrics_path, "compiled", lambda: m(**inputs))
-
-    option._out_fx_graphs[0].print_tabular()
-
-    answer_after = decode_output(outputs_after)
+    answer = decode_output(outputs)
 
     print(
         f"""
     model_name: {model_name}
     input:
         context: {context}
         question: {question}
-    answer before: {answer_before}
-    answer after: {answer_after}
+    answer: {answer}
     """
     )
 
-    # TODO: Add more checks for the compiled graph
-
-    # Check inference result
-    assert answer_before == answer_after
+    record_property("torch_ttnn", (m, inputs, outputs))
```

tests/models/bloom/test_bloom.py

Lines changed: 7 additions & 25 deletions

```diff
@@ -1,14 +1,14 @@
 import torch
-import torch_ttnn
 import pytest
-from torch_ttnn.metrics import RunTimeMetrics
 
 # Load model directly
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 @pytest.mark.xfail
-def test_bloom(device):
+def test_bloom(record_property):
+    record_property("model_name", "Bloom")
+
     # Download model from cloud
     model_name = "bigscience/bloom-1b1"
     tokenizer = AutoTokenizer.from_pretrained(
@@ -21,42 +21,24 @@ def test_bloom(device):
     test_input = "This is a sample text from "
     inputs = tokenizer(test_input, return_tensors="pt")
 
-    metrics_path = "Bloom"
     # Run inference with the original model
     with torch.no_grad():
-        outputs_before = RunTimeMetrics(metrics_path, "original", lambda: m(**inputs))
+        outputs = m(**inputs)
 
     # Helper function to decode output to human-readable text
     def decode_output(outputs):
         next_token_logits = outputs.logits[:, -1]
         next_token = next_token_logits.softmax(dim=-1).argmax()
         return tokenizer.decode([next_token])
 
-    decoded_output_before = decode_output(outputs_before)
-
-    # Compile model with ttnn backend
-    option = torch_ttnn.TorchTtnnOption(
-        device=device, gen_graphviz=True, metrics_path=metrics_path
-    )
-    m = torch.compile(m, backend=torch_ttnn.backend, options=option)
-
-    # Run inference with the compiled model
-    with torch.no_grad():
-        outputs_after = RunTimeMetrics(metrics_path, "compiled", lambda: m(**inputs))
-    option._out_fx_graphs[0].print_tabular()
-
-    decoded_output_after = decode_output(outputs_after)
+    decoded_output = decode_output(outputs)
 
     print(
         f"""
     model_name: {model_name}
     input: {test_input}
-    output before: {decoded_output_before}
-    output after: {decoded_output_after}
+    output: {decoded_output}
     """
     )
 
-    # TODO: Add more checks for the compiled graph
-
-    # Check inference result
-    assert decoded_output_before == decoded_output_after
+    record_property("torch_ttnn", (m, inputs, outputs))
```

tests/models/falcon/test_falcon.py

Lines changed: 7 additions & 26 deletions

```diff
@@ -1,14 +1,14 @@
 import torch
-import torch_ttnn
 import pytest
-from torch_ttnn.metrics import RunTimeMetrics
 
 # Load model directly
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
 @pytest.mark.xfail
-def test_falcon(device):
+def test_falcon(record_property):
+    record_property("model_name", "Falcon")
+
     # Download model from cloud
     model_name = "tiiuae/falcon-7b-instruct"
     tokenizer = AutoTokenizer.from_pretrained(
@@ -21,43 +21,24 @@ def test_falcon(device):
     test_input = "This is a sample text from "
     inputs = tokenizer(test_input, return_tensors="pt")
 
-    metrics_path = "Falcon"
     # Run inference with the original model
     with torch.no_grad():
-        outputs_before = RunTimeMetrics(metrics_path, "original", lambda: m(**inputs))
+        outputs = m(**inputs)
 
     # Helper function to decode output to human-readable text
     def decode_output(outputs):
         next_token_logits = outputs.logits[:, -1]
         next_token = next_token_logits.softmax(dim=-1).argmax()
         return tokenizer.decode([next_token])
 
-    decoded_output_before = decode_output(outputs_before)
-
-    # Compile model with ttnn backend
-    option = torch_ttnn.TorchTtnnOption(
-        device=device, gen_graphviz=True, metrics_path=metrics_path
-    )
-    m = torch.compile(m, backend=torch_ttnn.backend, options=option)
-
-    # Run inference with the compiled model
-    with torch.no_grad():
-        outputs_after = RunTimeMetrics(metrics_path, "compiled", lambda: m(**inputs))
-
-    option._out_fx_graphs[0].print_tabular()
-
-    decoded_output_after = decode_output(outputs_after)
+    decoded_output = decode_output(outputs)
 
     print(
         f"""
     model_name: {model_name}
     input: {test_input}
-    output before: {decoded_output_before}
-    output after: {decoded_output_after}
+    output: {decoded_output}
     """
     )
 
-    # TODO: Add more checks for the compiled graph
-
-    # Check inference result
-    assert decoded_output_before == decoded_output_after
+    record_property("torch_ttnn", (m, inputs, outputs))
```
