Add CLIP text model #643

Merged · 4 commits · Dec 9, 2024
4 changes: 4 additions & 0 deletions .github/workflows/ci-sharktank.yml
@@ -122,9 +122,13 @@ jobs:
iree-base-runtime

- name: Run tests
# TODO: unify with-t5-data and with-clip-data flags into a single flag
# and make it possible to run only tests that require data.
run: |
pytest \
--with-clip-data \
--with-t5-data \
sharktank/tests/models/clip/clip_test.py \
sharktank/tests/models/t5/t5_test.py \
--durations=0

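For local reproduction, the same test selection can be driven through pytest's Python entry point; a minimal sketch, assuming the repository root as the working directory and that the CLIP and T5 model data has already been fetched (not part of this change):

# Sketch only: mirrors the CI step above via pytest's programmatic API.
import pytest

exit_code = pytest.main(
    [
        "--with-clip-data",
        "--with-t5-data",
        "sharktank/tests/models/clip/clip_test.py",
        "sharktank/tests/models/t5/t5_test.py",
        "--durations=0",
    ]
)
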
9 changes: 9 additions & 0 deletions sharktank/conftest.py
@@ -88,6 +88,15 @@ def pytest_addoption(parser):
help="Enable all llama benchmarking tests",
)

parser.addoption(
"--with-clip-data",
action="store_true",
default=False,
help=(
"Enable tests that use CLIP data, such as models, that is not part of the "
"source code. The user is expected to provide this data."
),
)
parser.addoption(
"--with-t5-data",
action="store_true",
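
For context, a common pattern for consuming such an option on the test side is sketched below; the fixture and test names are illustrative and not part of this change:

# Illustrative only: gate a test on the new --with-clip-data flag by reading
# it from the pytest config and skipping when it is absent.
import pytest

@pytest.fixture
def clip_data_enabled(request):
    if not request.config.getoption("--with-clip-data"):
        pytest.skip("CLIP data tests disabled; pass --with-clip-data to enable")

def test_clip_text_model(clip_data_enabled):
    ...  # would load the user-provided CLIP model data here
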
2 changes: 1 addition & 1 deletion sharktank/sharktank/layers/__init__.py
@@ -9,7 +9,7 @@
from .kv_cache import BaseKVCache, DirectKVCache, PagedKVCache
from .causal_llm import BaseCausalLMModel
from .linear import LinearLayer
-from .norm import RMSNormLayer
+from .norm import RMSNormLayer, LayerNorm
from .rotary_embedding import RotaryEmbeddingLayer
from .token_embedding import TokenEmbeddingLayer
from .llama_attention_block import LlamaAttentionBlock
16 changes: 16 additions & 0 deletions sharktank/sharktank/layers/activations.py
@@ -0,0 +1,16 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from torch import nn
from .. import ops

# TODO: don't use nn.functional directly.
ACT2FN = {
"gelu": nn.functional.gelu,
"gelu_new": ops.gelu_tanh_approximation,
"relu": nn.functional.relu,
"quick_gelu": ops.gelu_sigmoid_approximation,
}
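
The table is keyed by the activation name carried in a model config (e.g. hidden_act); a minimal lookup sketch, assuming the module path matches the file location above:

# Illustrative: resolve an activation function by its config name and apply it.
import torch
from sharktank.layers.activations import ACT2FN

act_fn = ACT2FN["quick_gelu"]
y = act_fn(torch.randn(2, 77, 512))
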
50 changes: 48 additions & 2 deletions sharktank/sharktank/layers/configs/llm_configs.py
@@ -14,11 +14,11 @@
(and indeed, can bootstrap these off of GGUF files).
"""

-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
from typing import Any, Optional
import torch

-__all__ = ["LlamaHParams", "LlamaModelConfig", "T5Config"]
+__all__ = ["ClipTextConfig", "LlamaHParams", "LlamaModelConfig", "T5Config"]


@dataclass
@@ -266,3 +266,49 @@ def from_gguf_properties(properties: dict[str, Any], **kwargs):
all_kwargs.update(kwargs)

return T5Config(**all_kwargs)


@dataclass
class ClipTextConfig:
vocab_size: int = 49408
hidden_size: int = 512
intermediate_size: int = 2048
projection_dim: int = 512
num_hidden_layers: int = 12
num_attention_heads: int = 8
max_position_embeddings: int = 77
hidden_act: str = "quick_gelu"
layer_norm_eps: float = 1e-5
# This differs from `CLIPTokenizer`'s default and from openai/clip
# See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
pad_token_id: int = 1
bos_token_id: int = 49406
eos_token_id: int = 49407
output_attentions: bool = False
output_hidden_states: bool = False
use_return_dict: bool = True

@staticmethod
def from_transformers_clip_text_config(
config: "transformers.CLIPTextConfig",
) -> "ClipTextConfig":
return ClipTextConfig(
vocab_size=config.vocab_size,
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
projection_dim=config.projection_dim,
num_hidden_layers=config.num_hidden_layers,
num_attention_heads=config.num_attention_heads,
max_position_embeddings=config.max_position_embeddings,
hidden_act=config.hidden_act,
layer_norm_eps=config.layer_norm_eps,
pad_token_id=config.pad_token_id,
bos_token_id=config.bos_token_id,
eos_token_id=config.eos_token_id,
output_attentions=config.output_attentions,
output_hidden_states=config.output_hidden_states,
use_return_dict=config.use_return_dict,
)

def as_properties(self) -> dict[str, Any]:
return asdict(self)
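
A short usage sketch for the new config, converting from a Hugging Face transformers.CLIPTextConfig and dumping back to a plain dict; the import path follows the file location above and the example is illustrative, not taken from this PR's tests:

# Sketch: round-trip a Hugging Face CLIP text config through ClipTextConfig.
import transformers
from sharktank.layers.configs.llm_configs import ClipTextConfig

hf_config = transformers.CLIPTextConfig()  # defaults describe the small CLIP text model
config = ClipTextConfig.from_transformers_clip_text_config(hf_config)
assert config.hidden_act == "quick_gelu"
properties = config.as_properties()  # plain dict via dataclasses.asdict
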
20 changes: 20 additions & 0 deletions sharktank/sharktank/layers/norm.py
@@ -39,3 +39,23 @@ def forward(self, x: torch.Tensor):
# often in higher precision. Downcast back to expected.
norm = ops.to(norm, orig_dtype)
return norm


class LayerNorm(ThetaLayer):
def __init__(
self,
theta: Theta,
*,
weight_name: str = "weight",
bias_name: str = "bias",
eps: float = 1e-05,
):
super().__init__(theta)
self.weight = self.theta_tensor(weight_name)
self.bias = None
if bias_name in self.theta.keys:
self.bias = self.theta_tensor(bias_name)
self.eps = eps

def forward(self, x: torch.Tensor):
return ops.layer_norm(x, weight=self.weight, bias=self.bias, eps=self.eps)
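
The forward pass defers to ops.layer_norm; below is an illustrative reference for the expected math, assuming semantics equivalent to torch.nn.functional.layer_norm over the trailing feature dimension (shapes are arbitrary and not part of this change):

# Illustrative numerics: normalize over the last dimension, then apply the
# affine weight and bias, matching torch.nn.functional.layer_norm.
import torch

x = torch.randn(2, 77, 512)
weight, bias, eps = torch.randn(512), torch.randn(512), 1e-5

mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True, unbiased=False)
expected = (x - mean) / torch.sqrt(var + eps) * weight + bias

actual = torch.nn.functional.layer_norm(x, (512,), weight, bias, eps)
assert torch.allclose(actual, expected, atol=1e-5)
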
8 changes: 8 additions & 0 deletions sharktank/sharktank/models/clip/__init__.py
@@ -0,0 +1,8 @@
# Copyright 2024 Advanced Micro Devices, Inc
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

from .clip import *
from .export import *