 
 import dataclasses
 import enum
+import os
 import torch
 from typing import Optional, Sequence
+from .siglip_vision import config as siglip_vision_config
 
 
 # Keep a mapping from dtype strings to the supported torch dtypes.
@@ -37,6 +39,7 @@ class AttentionType(enum.Enum):
 class Architecture(enum.Enum):
     GEMMA_1 = 1
     GEMMA_2 = 2
+    GEMMA_3 = 3
 
 
 @dataclasses.dataclass
@@ -66,7 +69,9 @@ class GemmaConfig:
     # Whether a quantized version of the model is used.
     quant: bool = False
     # The path to the model tokenizer.
-    tokenizer: Optional[str] = 'tokenizer/tokenizer.model'
+    tokenizer: Optional[str] = (
+        'tokenizer/tokenizer.model'
+    )
     # The types of attention used in the layers of the model.
     attn_types: Optional[Sequence[AttentionType]] = None
     # The size of the sliding window used for local attention.
@@ -82,28 +87,38 @@ class GemmaConfig:
     use_pre_ffw_norm: bool = False
     # Whether to use post mlp normalization.
     use_post_ffw_norm: bool = False
+    # The wave length of the rotary embedding.
+    rope_wave_length: dict[AttentionType, int] | None = None
+    # Whether to use QK normalization in the attention blocks.
+    use_qk_norm: bool = False
+    # Vision model config.
+    vision_config: siglip_vision_config.SiglipVisionModelConfig | None = None
+    # The factor by which the rope wave length is divided for global layers.
+    rope_scaling_factor: int | None = None
 
     def get_dtype(self) -> Optional[torch.dtype]:
         """Gets the torch dtype from the config dtype string."""
         return _STR_DTYPE_TO_TORCH_DTYPE.get(self.dtype, None)
 
 
-def get_config_for_7b() -> GemmaConfig:
-    return GemmaConfig()
+def get_config_for_7b(dtype: str = 'bfloat16') -> GemmaConfig:
+    return GemmaConfig(dtype=dtype)
 
 
-def get_config_for_2b() -> GemmaConfig:
+def get_config_for_2b(dtype: str = 'bfloat16') -> GemmaConfig:
     return GemmaConfig(
+        dtype=dtype,
         num_hidden_layers=18,
         num_attention_heads=8,
         num_key_value_heads=1,
         hidden_size=2048,
-        intermediate_size=16384
+        intermediate_size=16384,
     )
 
 
-def get_config_for_2b_v2() -> GemmaConfig:
+def get_config_for_2b_v2(dtype: str = 'bfloat16') -> GemmaConfig:
     return GemmaConfig(
+        dtype=dtype,
         architecture=Architecture.GEMMA_2,
         num_hidden_layers=26,
         num_attention_heads=8,
@@ -120,8 +135,9 @@ def get_config_for_2b_v2() -> GemmaConfig:
     )
 
 
-def get_config_for_9b() -> GemmaConfig:
+def get_config_for_9b(dtype: str = 'bfloat16') -> GemmaConfig:
     return GemmaConfig(
+        dtype=dtype,
         architecture=Architecture.GEMMA_2,
         num_hidden_layers=42,
         num_attention_heads=16,
@@ -138,38 +154,187 @@ def get_config_for_9b() -> GemmaConfig:
     )
 
 
-def get_config_for_27b() -> GemmaConfig:
-    return GemmaConfig(
-        architecture=Architecture.GEMMA_2,
-        num_hidden_layers=46,
-        num_attention_heads=32,
-        num_key_value_heads=16,
-        hidden_size=4608,
-        intermediate_size=36864,
-        use_pre_ffw_norm=True,
-        use_post_ffw_norm=True,
-        final_logit_softcapping=30.0,
-        attn_logit_softcapping=50.0,
-        head_dim=128,
-        attn_types=[AttentionType.LOCAL_SLIDING, AttentionType.GLOBAL] * 23,
-        sliding_window_size=4096,
-        query_pre_attn_scalar=144,  # hidden_size / num_attention_heads
-    )
+def get_config_for_27b(dtype: str = 'bfloat16') -> GemmaConfig:
+    return GemmaConfig(
+        dtype=dtype,
+        architecture=Architecture.GEMMA_2,
+        num_hidden_layers=46,
+        num_attention_heads=32,
+        num_key_value_heads=16,
+        hidden_size=4608,
+        intermediate_size=36864,
+        use_pre_ffw_norm=True,
+        use_post_ffw_norm=True,
+        final_logit_softcapping=30.0,
+        attn_logit_softcapping=50.0,
+        head_dim=128,
+        attn_types=[AttentionType.LOCAL_SLIDING, AttentionType.GLOBAL] * 23,
+        sliding_window_size=4096,
+        query_pre_attn_scalar=144,  # hidden_size / num_attention_heads
+    )
 
 
-def get_model_config(variant: str) -> GemmaConfig:
-    if variant == '7b':
-        return get_config_for_7b()
-    elif variant == '2b':
-        return get_config_for_2b()
-    elif variant == '2b-v2':
-        return get_config_for_2b_v2()
-    elif variant == '9b':
-        return get_config_for_9b()
-    elif variant == '27b':
-        return get_config_for_27b()
-    else:
-        raise ValueError(
-            f'Invalid variant {variant}. Supported variants are "2b"'
-            'and "7b" and "9b" and "27b".')
+def get_config_for_1b(dtype: str) -> GemmaConfig:
+    return GemmaConfig(
+        dtype=dtype,
+        architecture=Architecture.GEMMA_3,
+        num_hidden_layers=26,
+        num_attention_heads=4,
+        num_key_value_heads=1,
+        hidden_size=1152,
+        intermediate_size=6912,
+        use_pre_ffw_norm=True,
+        use_post_ffw_norm=True,
+        head_dim=256,
+        attn_types=(
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.GLOBAL,
+        ),
+        sliding_window_size=512,
+        rope_wave_length={
+            AttentionType.LOCAL_SLIDING: 10_000,
+            AttentionType.GLOBAL: 1_000_000,
+        },
+        vocab_size=262_144,
+        max_position_embeddings=32_768,
+        tokenizer='tokenizer/gemma3_cleaned_262144_v2.spiece.model',
+        use_qk_norm=True,
+        vision_config=None,
+    )
 
+
+def get_config_for_4b(dtype: str) -> GemmaConfig:
+    return GemmaConfig(
+        dtype=dtype,
+        architecture=Architecture.GEMMA_3,
+        num_hidden_layers=34,
+        num_attention_heads=8,
+        num_key_value_heads=4,
+        hidden_size=2560,
+        intermediate_size=10240,
+        use_pre_ffw_norm=True,
+        use_post_ffw_norm=True,
+        head_dim=256,
+        attn_types=(
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.GLOBAL,
+        ),
+        sliding_window_size=1024,
+        rope_wave_length={
+            AttentionType.LOCAL_SLIDING: 10_000,
+            AttentionType.GLOBAL: 1_000_000,
+        },
+        vocab_size=262_144,
+        tokenizer='tokenizer/gemma3_cleaned_262144_v2.spiece.model',
+        use_qk_norm=True,
+        vision_config=siglip_vision_config.get_siglip_vision_model_config(),
+        rope_scaling_factor=8,
+    )
+
+
+def get_config_for_12b(dtype: str) -> GemmaConfig:
+    return GemmaConfig(
+        dtype=dtype,
+        architecture=Architecture.GEMMA_3,
+        num_hidden_layers=48,
+        num_attention_heads=16,
+        num_key_value_heads=8,
+        hidden_size=3840,
+        intermediate_size=3840 * 8 // 2,
+        use_pre_ffw_norm=True,
+        use_post_ffw_norm=True,
+        head_dim=256,
+        attn_types=(
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.GLOBAL,
+        ),
+        sliding_window_size=1024,
+        rope_wave_length={
+            AttentionType.LOCAL_SLIDING: 10_000,
+            AttentionType.GLOBAL: 1_000_000,
+        },
+        vocab_size=262_144,
+        max_position_embeddings=131_072,
+        tokenizer='tokenizer/gemma3_cleaned_262144_v2.spiece.model',
+        use_qk_norm=True,
+        vision_config=siglip_vision_config.get_siglip_vision_model_config(),
+        rope_scaling_factor=8,
+    )
+
+
+def get_config_for_27b_v3(dtype: str) -> GemmaConfig:
+    return GemmaConfig(
+        dtype=dtype,
+        architecture=Architecture.GEMMA_3,
+        num_hidden_layers=62,
+        num_attention_heads=32,
+        num_key_value_heads=16,
+        hidden_size=5376,
+        intermediate_size=5376 * 8 // 2,
+        use_pre_ffw_norm=True,
+        use_post_ffw_norm=True,
+        head_dim=128,
+        query_pre_attn_scalar=5376 // 32,
+        attn_types=(
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.LOCAL_SLIDING,
+            AttentionType.GLOBAL,
+        ),
+        sliding_window_size=1024,
+        rope_wave_length={
+            AttentionType.LOCAL_SLIDING: 10_000,
+            AttentionType.GLOBAL: 1_000_000,
+        },
+        vocab_size=262_144,
+        max_position_embeddings=131_072,
+        tokenizer='tokenizer/gemma3_cleaned_262144_v2.spiece.model',
+        use_qk_norm=True,
+        vision_config=siglip_vision_config.get_siglip_vision_model_config(),
+        rope_scaling_factor=8,
+    )
+
+
+def get_model_config(variant: str, dtype: str = 'bfloat16') -> GemmaConfig:
+    """Gets the GemmaConfig for the desired variant and dtype."""
+    # Gemma1 variants
+    if variant == '7b':
+        return get_config_for_7b(dtype)
+    elif variant == '2b':
+        return get_config_for_2b(dtype)
+    # Gemma2 variants
+    elif variant == '2b-v2':
+        return get_config_for_2b_v2(dtype)
+    elif variant == '9b':
+        return get_config_for_9b(dtype)
+    elif variant == '27b':
+        return get_config_for_27b(dtype)
+    # Gemma3 variants
+    elif variant == '1b':
+        return get_config_for_1b(dtype)
+    elif variant == '4b':
+        return get_config_for_4b(dtype)
+    elif variant == '12b':
+        return get_config_for_12b(dtype)
+    elif variant == '27b_v3':
+        return get_config_for_27b_v3(dtype)
+    # Invalid variants
+    else:
+        raise ValueError(
+            f'Invalid variant {variant}. Supported variants are "1b", "2b", '
+            '"2b-v2", "4b", "7b", "9b", "12b", "27b", and "27b_v3".'
+        )
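
For reference, a minimal usage sketch of the updated factory function. It assumes this file remains importable as gemma.config (the import alias gemma_config below is illustrative, not part of the diff); the variant strings and the 'bfloat16' default dtype come from the code above.

from gemma import config as gemma_config

# Gemma 3 variant: GEMMA_3 architecture, QK normalization, SigLIP vision config.
cfg = gemma_config.get_model_config('4b')  # dtype defaults to 'bfloat16'
print(cfg.architecture)   # Architecture.GEMMA_3
print(cfg.get_dtype())    # torch.bfloat16

# Gemma 2 variant with an explicit dtype string.
cfg_9b = gemma_config.get_model_config('9b', dtype='float32')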