nod-ai
diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py
+5-1 b/sharktank/sharktank/examples/export_paged_llm_v1.py
+5-1
diff --git a/sharktank/sharktank/examples/paged_llm_v1.py
+6-1 b/sharktank/sharktank/examples/paged_llm_v1.py
+6-1
diff --git a/sharktank/sharktank/examples/validate_direct_mixtral_model.py
+144 b/sharktank/sharktank/examples/validate_direct_mixtral_model.py
+144
diff --git a/sharktank/sharktank/examples/validate_mixtral_ref_model.py
+48 b/sharktank/sharktank/examples/validate_mixtral_ref_model.py
+48
diff --git a/sharktank/sharktank/layers/__init__.py
+5 b/sharktank/sharktank/layers/__init__.py
+5
diff --git a/sharktank/sharktank/layers/base.py
+1-4 b/sharktank/sharktank/layers/base.py
+1-4
diff --git a/sharktank/sharktank/layers/configs/llm_configs.py
+26-6 b/sharktank/sharktank/layers/configs/llm_configs.py
+26-6
diff --git a/sharktank/sharktank/layers/ffn_block.py
+38 b/sharktank/sharktank/layers/ffn_block.py
+38
@@ -16,6 +16,7 @@
 
 # TODO: Should be using a base class with the protocol supported.
 from ..models.llama.llama import LlamaModelConfig, PagedLlamaModelV1
+from ..models.mixtral.mixtral import *
 
 
 def main():
@@ -52,7 +53,10 @@ def main():
     llama_config = LlamaModelConfig(hp)
     llama_config.static_tables = False  # Rely on the compiler for hoisting tables.
     llama_config.kv_cache_type = "direct" if args.bs == [1] else "paged"
-    model = PagedLlamaModelV1(dataset.root_theta, llama_config)
+    if llama_config.hp.expert_count:
+        model = PagedMixtralModelV1(dataset.root_theta, llama_config)
+    else:
+        model = PagedLlamaModelV1(dataset.root_theta, llama_config)
 
     def generate_params_json(hp, prefill_bs: list[int], decode_bs: list[int]):
         return {
 
@@ -17,6 +17,7 @@
 from ..types import *
 
 # TODO: Should be using a base class with the protocol supported.
+from ..models.mixtral.mixtral import *
 from ..models.llama.llama import *
 from ..utils.debugging import trace_tensor
 from ..utils.tokenizer import InferenceTokenizer, load_tokenizer
@@ -236,7 +237,11 @@ def main():
         activation_dtype=activation_dtype,
         attention_dtype=activation_dtype,
     )
-    model = PagedLlamaModelV1(dataset.root_theta, config)
+
+    if config.hp.expert_count:
+        model = PagedMixtralModelV1(dataset.root_theta, config)
+    else:
+        model = PagedLlamaModelV1(dataset.root_theta, config)
     if args.save_intermediates_path:
         from ..utils.patching import SaveModuleResultTensorsPatch
 
 
@@ -0,0 +1,144 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import sys
+
+import torch
+
+from sharktank.layers import *
+from sharktank.types import *
+from sharktank.models.mixtral.mixtral import *
+
+
+@torch.no_grad()
+def main(args: list[str]):
+    """Run one prefill and one decode step through ``PagedMixtralModelV1``.
+
+    The model is built from the dataset selected by the CLI input-dataset
+    options, with a "direct" KV cache and float16 activations. A fixed prompt
+    (12 tokens, zero-padded to 64) is prefilled at batch size 1, then a single
+    token is decoded; the extracted tokens are printed after each step.
+
+    Gradient tracking is disabled through the ``torch.no_grad`` decorator so
+    that the previous grad mode is restored when this function returns.
+
+    Args:
+        args: Accepted for the script entry point but not read; the name is
+            rebound to the namespace returned by ``cli.parse(parser)``.
+    """
+    from ..utils import cli
+
+    parser = cli.create_parser()
+    cli.add_input_dataset_options(parser)
+    args = cli.parse(parser)
+
+    dataset = cli.get_input_dataset(args)
+    hp = configs.LlamaHParams.from_gguf_props(dataset.properties)
+    llama_config = LlamaModelConfig(hp)
+    llama_config.kv_cache_type = "direct"
+    llama_config.activation_dtype = torch.float16
+    model = PagedMixtralModelV1(dataset.root_theta, llama_config)
+
+    # bs ("batch size") == 1
+    cache_state = model.cache.allocate(bs=1)
+
+    start_index = 0
+    # 12 prompt tokens followed by zero padding, 64 entries in total.
+    tokens = torch.tensor(
+        [
+            [
+                1,
+                1059,
+                31871,
+                1217,
+                322,
+                266,
+                3682,
+                6075,
+                31902,
+                13,
+                31849,
+                31871,
+                0,
+                0,
+                0,
+                0,
+            ]
+            + 48 * [0],
+        ]
+    )
+    # The padded length must be a whole number of cache blocks.
+    assert tokens.shape[1] % model.cache.block_seq_stride == 0
+    # NOTE(review): presumably cache block ids for the single sequence;
+    # confirm the meaning of 127 against the cache implementation.
+    seq_block_ids = torch.tensor(
+        [
+            [127, 0, 0, 0],
+        ]
+    )
+
+    # Important: Do not use a sequence length of 0 for empty batch slots
+    # as it will cause softmax to produce NaN due to a mask of all -inf,
+    # and the NaNs then propagate through everything downstream.
+    seq_lens = torch.tensor([12])
+
+    attention_mask = model.attention_mask(
+        model.input_mask(seq_lens, tokens.shape[1]),
+    )
+
+    print(f"Step {start_index}")
+    logits = model.prefill(
+        tokens,
+        attention_mask=attention_mask,
+        seq_block_ids=seq_block_ids,
+        cache_state=cache_state,
+    )
+    # TODO: Normalize the output of extract_tokens_from_logits into tensor [bs, 1].
+    tokens = torch.tensor(model.extract_tokens_from_logits(logits, seq_lens)).unsqueeze(
+        1
+    )
+    print(f"  : tokens = {tokens}")
+
+    # Decode a step.
+    print("Decoding...")
+    print(tokens.shape, tokens)
+    # Decoding starts right after the prefilled tokens; take the position from
+    # seq_lens (copied before it is advanced) instead of repeating the literal.
+    start_positions = seq_lens.clone()
+    seq_lens = seq_lens + 1
+    decode_attention_mask = model.decode_attention_mask(
+        model.input_mask(
+            seq_lens,
+            seq_block_ids.shape[1] * model.cache.block_seq_stride,
+        ),
+    )
+    logits = model.decode(
+        tokens,
+        attention_mask=decode_attention_mask,
+        start_positions=start_positions,
+        seq_block_ids=seq_block_ids,
+        cache_state=cache_state,
+    )
+    tokens = torch.tensor(model.extract_tokens_from_logits(logits, [1])).unsqueeze(1)
+    print(f"  : tokens = {tokens}")
+
+    # NOTE(review): this helper is defined but never called inside main();
+    # confirm whether it should be invoked or removed.
+    def save_prefill_module(model):
+        """Export the prefill path with torch.export and write MLIR bytecode.
+
+        Reads ``tokens``, ``attention_mask``, ``seq_block_ids`` and
+        ``cache_state`` from the enclosing scope as example inputs and writes
+        the imported module to ``/tmp/prefill.mlirbc``.
+        """
+        from iree.compiler.extras.fx_importer import FxImporter
+        from iree.compiler.ir import AsmState
+
+        importer = FxImporter()
+
+        print("Generating FX graph")
+
+        class InferenceModule(torch.nn.Module):
+            """Wrapper whose ``forward`` delegates to ``model.prefill``."""
+
+            def __init__(self):
+                super().__init__()
+                self.add_module("prefill", model)
+
+            def forward(self, tokens, attention_mask, seq_block_ids, *cache_state):
+                # The cache arrives as varargs and is handed to prefill as a list.
+                return self.prefill.prefill(
+                    tokens,
+                    attention_mask=attention_mask,
+                    seq_block_ids=seq_block_ids,
+                    cache_state=list(cache_state),
+                )
+
+        infmod = InferenceModule()
+        prog = torch.export.export(
+            infmod, (tokens, attention_mask, seq_block_ids) + tuple(cache_state)
+        )
+
+        print("FX prog:", prog)
+        importer.import_program(prog, func_name="prefill")
+        output_file = "/tmp/prefill.mlirbc"
+        print("Saving to:", output_file)
+        with open(output_file, "wb") as f:
+            importer.module_op.write_bytecode(f)
+
+
+# Script entry point. main() has no return statement, so sys.exit() receives
+# None when the run completes.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
@@ -0,0 +1,48 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import sys
+
+import torch
+
+from sharktank.layers import *
+from sharktank.types import *
+from sharktank.models.mixtral.mixtral_ref import *
+
+
+@torch.no_grad()
+def main(args: list[str]):
+    """Run a prompt and one decode step through ``DirectCacheMixtralModelV1``.
+
+    The reference model is built from the dataset selected by the CLI
+    input-dataset options with float16 activations. A fixed 12-token prompt
+    is passed to ``model.forward`` at batch size 1, and its result is fed
+    back in for a single decode step; both results are printed.
+
+    Gradient tracking is disabled through the ``torch.no_grad`` decorator so
+    that the previous grad mode is restored when this function returns.
+
+    Args:
+        args: Accepted for the script entry point but not read; the name is
+            rebound to the namespace returned by ``cli.parse(parser)``.
+    """
+    from ..utils import cli
+
+    parser = cli.create_parser()
+    cli.add_input_dataset_options(parser)
+    args = cli.parse(parser)
+
+    dataset = cli.get_input_dataset(args)
+    hp = configs.LlamaHParams.from_gguf_props(dataset.properties)
+    ref_llama_config = RefLlamaModelConfig(hp)
+    ref_llama_config.activation_dtype = torch.float16
+    model = DirectCacheMixtralModelV1(dataset.root_theta, ref_llama_config)
+
+    kv_cache = model.create_cache(bs=1)
+    start_index = 0
+    next_tokens = [1, 1059, 31871, 1217, 322, 266, 3682, 6075, 31902, 13, 31849, 31871]
+    print(f"Step {start_index}")
+    tokens = model.forward(
+        torch.tensor([next_tokens]), start_index=start_index, local_kv_cache=kv_cache
+    )
+    print(f"  : tokens = {tokens}")
+
+    # Decode a step.
+    print("Decoding...")
+    print(tokens.shape, tokens)
+    # The decode step starts right after the prompt; compute the offset from
+    # the prompt length instead of repeating the literal.
+    decode_start_index = start_index + len(next_tokens)
+    decode_token = model.forward(
+        tokens, start_index=decode_start_index, local_kv_cache=kv_cache
+    )
+    print(f"  : decode tokens = {decode_token}")
+
+
+# Script entry point. main() has no return statement, so sys.exit() receives
+# None when the run completes.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
@@ -12,5 +12,10 @@
 from .norm import RMSNormLayer
 from .rotary_embedding import RotaryEmbeddingLayer
 from .token_embedding import TokenEmbeddingLayer
+from .llama_attention_block import LlamaAttentionBlock
+from .paged_llama_attention_block import PagedLlamaAttentionBlock
+from .ffn_block import FFN
+from .ffn_moe_block import FFNMOE
+from .mixture_of_experts_block import SparseMoeBlock
 
 from . import configs
@@ -16,11 +16,8 @@
 from ..utils import debugging
 
 __all__ = [
-    "LinearLayer",
-    "RotaryEmbeddingLayer",
-    "RMSNormLayer",
+    "BaseLayer",
     "ThetaLayer",
-    "TokenEmbedding",
 ]
 
 
 
@@ -19,9 +19,7 @@
 
 import torch
 
-__all__ = [
-    "LlamaHParams",
-]
+__all__ = ["LlamaHParams"]
 
 
 @dataclass
@@ -36,14 +34,21 @@ class LlamaHParams:
     block_count: int
     feed_forward_length: int
     rope_dimension_count: int
+    rope_freq_base: float
     attention_head_count: int
     attn_head_dim: int
     attention_layer_norm_rms_epsilon: float
     attention_head_count_kv: int
+    expert_count: int
+    expert_used_count: int
 
     @staticmethod
     def from_gguf_props(p: dict[str, Any]):
+        default_expert_count = 0
+        default_expert_used_count = 0
+        default_rope_freq_base = 10000.0
         attention_head_count = _int_prop(p, "llama.attention.head_count")
+
         return LlamaHParams(
             context_length=_int_prop(p, "llama.context_length"),
             embedding_length=_int_prop(p, "llama.embedding_length"),
@@ -58,6 +63,15 @@ def from_gguf_props(p: dict[str, Any]):
             attention_head_count_kv=_optional_int_prop(
                 p, "llama.attention.head_count_kv", attention_head_count
             ),
+            rope_freq_base=_optional_float_prop(
+                p, "llama.rope.freq_base", default_rope_freq_base
+            ),
+            expert_count=_optional_int_prop(
+                p, "llama.expert_count", default_expert_count
+            ),
+            expert_used_count=_optional_int_prop(
+                p, "llama.expert_used_count", default_expert_used_count
+            ),
         )
 
 
@@ -79,10 +93,16 @@ def _int_prop(p: dict[str, Any], name: str) -> int:
         raise KeyError(f"Property '{name}' not found (among keys {p.keys()})")
 
 
+def _optional_float_prop(p: dict[str, Any], name: str, default_value: float) -> float:
+    """Return property ``name`` from ``p`` as a float, or the default.
+
+    ``default_value`` is used when the key is absent or its value is ``None``.
+
+    Raises:
+        ValueError: If the value cannot be converted to a float, whether
+            because of its content (e.g. ``"abc"``) or its type (e.g. a list).
+    """
+    value = p.get(name)
+    if value is None:
+        value = default_value
+    try:
+        return float(value)
+    except (TypeError, ValueError) as e:
+        # float() raises TypeError for unsupported types; report both the same way.
+        raise ValueError(f"Property '{name}' expected to be a float and was not") from e
+
+
 def _optional_int_prop(p: dict[str, Any], name: str, default_value: int) -> int:
-    value = p[name]
-    if value is None:
-        return default_value
+    value = p.get(name, default_value)
     try:
         return int(value)
     except ValueError as e:
 
@@ -0,0 +1,38 @@
+# Copyright 2024 Advanced Micro Devices, Inc
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from .base import Theta, ThetaLayer
+from .linear import LinearLayer
+
+__all__ = [
+    "FFN",
+]
+
+
+class FFN(ThetaLayer):
+    """Gated feed-forward block made of three linear projections.
+
+    Computes ``ffn_down(silu(ffn_gate(h)) * ffn_up(h))``. Each projection is a
+    ``LinearLayer`` built from the sub-theta of the same name.
+    """
+
+    def __init__(self, theta: Theta):
+        super().__init__(theta)
+
+        # Register the projections in the order gate, up, down.
+        for proj_name in ("ffn_gate", "ffn_up", "ffn_down"):
+            self.add_module(proj_name, LinearLayer(theta(proj_name)))
+
+    def forward(self, h: torch.Tensor):
+        """Apply the gated feed-forward transform to ``h`` and return the result."""
+        gate_act = F.silu(self.ffn_gate(h))
+        return self.ffn_down(gate_act * self.ffn_up(h))