OML-Team
diff --git a/‎oml/configs/extractor/vit_v2.yaml
Lines changed: 5 additions & 0 deletions b/‎oml/configs/extractor/vit_v2.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/‎oml/models/__init__.py
Lines changed: 1 addition & 0 deletions b/‎oml/models/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎oml/models/vit_dinov2/__init__.py b/‎oml/models/vit_dinov2/__init__.py
diff --git a/‎oml/models/vit_dinov2/external/__init__.py b/‎oml/models/vit_dinov2/external/__init__.py
diff --git a/‎oml/models/vit_dinov2/external/config.py
Lines changed: 17 additions & 0 deletions b/‎oml/models/vit_dinov2/external/config.py
Lines changed: 17 additions & 0 deletions
diff --git a/‎oml/models/vit_dinov2/external/hubconf.py
Lines changed: 138 additions & 0 deletions b/‎oml/models/vit_dinov2/external/hubconf.py
Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,5 @@
+name: vit_v2
+args:
+  arch: vits14
+  normalise_features: False
+  weights: null
@@ -7,4 +7,5 @@
 from oml.models.resnet.extractor import ResnetExtractor
 from oml.models.vit_clip.extractor import ViTCLIPExtractor
 from oml.models.vit_dino.extractor import ViTExtractor
+from oml.models.vit_dinov2.extractor import ViTExtractor_v2
 from oml.models.vit_unicom.extractor import ViTUnicomExtractor
@@ -0,0 +1,17 @@
+# References:
+#   https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/config.py
+
+import os
+
+import torch
+
+# use torch.scaled_dot_product_attention where possible
+_HAS_FUSED_ATTN = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+_USE_FUSED_ATTN = int(os.environ.get("USE_FUSED_ATTN", 0))
+
+
+def use_fused_attn() -> bool:
+    # NOTE: ONNX export cannot handle F.scaled_dot_product_attention as of pytorch 2.0
+    if not _HAS_FUSED_ATTN:
+        return False
+    return _USE_FUSED_ATTN > 0
@@ -0,0 +1,138 @@
+# type: ignore
+# flake8: noqa
+
+
+from enum import Enum
+from typing import Union
+
+import torch
+
+import oml.models.vit_dinov2.external.vision_transformer as vits
+from oml.const import CKPT_SAVE_ROOT
+
+# ============== CODE FROM DINOV2 ==============
+# https://github.com/facebookresearch/dinov2/blob/main/dinov2/hub/backbones.py
+
+_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+
+
+def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
+    compact_arch_name = arch_name.replace("_", "")[:4]
+    registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
+    return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
+
+
+class Weights(Enum):
+    LVD142M = "LVD142M"
+
+
+def _make_dinov2_model(
+    *,
+    arch_name: str = "vit_large",
+    img_size: int = 518,
+    patch_size: int = 14,
+    init_values: float = 1.0,
+    ffn_layer: str = "mlp",
+    block_chunks: int = 0,
+    num_register_tokens: int = 0,
+    interpolate_antialias: bool = False,
+    interpolate_offset: float = 0.1,
+    pretrained: bool = True,
+    weights: Union[Weights, str] = Weights.LVD142M,
+    **kwargs,
+):
+    if isinstance(weights, str):
+        try:
+            weights = Weights[weights]
+        except KeyError as e:
+            raise AssertionError(f"Unsupported weights: {weights}") from e
+
+    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
+    vit_kwargs = {
+        "img_size": img_size,
+        "patch_size": patch_size,
+        "init_values": init_values,
+        "ffn_layer": ffn_layer,
+        "block_chunks": block_chunks,
+        "num_register_tokens": num_register_tokens,
+        "interpolate_antialias": interpolate_antialias,
+        "interpolate_offset": interpolate_offset,
+    }
+    vit_kwargs.update(**kwargs)
+    model = vits.__dict__[arch_name](**vit_kwargs)
+
+    if pretrained:
+        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
+        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
+        state_dict = torch.hub.load_state_dict_from_url(
+            url, map_location="cpu", model_dir=str(CKPT_SAVE_ROOT.resolve())
+        )
+        model.load_state_dict(state_dict, strict=True)
+
+    return model
+
+
+def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)
+
+
+def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)
+
+
+def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
+    """
+    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)
+
+
+def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset
+    """
+    return _make_dinov2_model(
+        arch_name="vit_small",
+        pretrained=pretrained,
+        weights=weights,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )
+
+
+def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset
+    """
+    return _make_dinov2_model(
+        arch_name="vit_base",
+        pretrained=pretrained,
+        weights=weights,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )
+
+
+def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset
+    """
+    return _make_dinov2_model(
+        arch_name="vit_large",
+        pretrained=pretrained,
+        weights=weights,
+        num_register_tokens=4,
+        interpolate_antialias=True,
+        interpolate_offset=0.0,
+        **kwargs,
+    )