Run Gemma 3 on GPUs

MayankChaturvedi · MayankChaturvedi · commit 0dd75b06fe42 · 2025-03-12T05:37:41.000Z
diff --git a/gemma/gemma3_model.py b/gemma/gemma3_model.py
@@ -97,7 +97,7 @@ def forward(self,
             self.global_freqs_cis.index_select(0, input_positions)
         )
     hidden_states = self.text_token_embedder(input_token_ids)
-    normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+    normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype, device=hidden_states.device)
     hidden_states = hidden_states * normalizer
     if image_patches is not None and self.config.vision_config is not None:
       # the input has images
@@ -127,7 +127,7 @@ def forward(self,
     embedder_weight = self.text_token_embedder.weight
     if self.config.quant:
       embedder_weight = (
-                embedder_weight * self.embedder.weight_scaler.unsqueeze(-1))
+                embedder_weight * self.text_token_embedder.weight_scaler.unsqueeze(-1))
 
     next_tokens, logits = self.sampler(
             embedding=embedder_weight,
@@ -162,7 +162,7 @@ def populate_image_embeddings(self,
 
   def create_attention_mask(self, input_ids: torch.Tensor, sequence_length: int):
     batch_size = input_ids.shape[0]
-    causal_mask = torch.tril(torch.ones((batch_size, 1, sequence_length, sequence_length), dtype=torch.bool))
+    causal_mask = torch.tril(torch.ones((batch_size, 1, sequence_length, sequence_length), dtype=torch.bool, device=input_ids.device))
     image_token_mask = input_ids == self.tokenizer.image_token_placeholder_id
     # Pad the mask to the left with 0. This is to make sure the boundary
     # detection works correctly. Boundary (starting index of image patch) is
@@ -202,7 +202,7 @@ def create_attention_mask(self, input_ids: torch.Tensor, sequence_length: int):
     # local attention is within the sliding window.
     local_mask = torch.logical_and(
             attention_mask,
-            torch.triu(torch.ones((1, 1, sequence_length, sequence_length), dtype=torch.bool), diagonal=-(self.config.sliding_window_size-1))
+            torch.triu(torch.ones((1, 1, sequence_length, sequence_length), dtype=torch.bool, device=input_ids.device), diagonal=-(self.config.sliding_window_size-1))
         )
     return attention_mask, local_mask
 
@@ -233,8 +233,8 @@ def generate(
     if self.config.sliding_window_size is None:
       raise ValueError('gemma 3 model requires sliding_window size')
     boolean_mask, local_boolean_mask = self.create_attention_mask(user_input_token_ids, total_seq_len)
-    mask_tensor = torch.where(boolean_mask, 0, min_dtype).contiguous()
-    local_mask_tensor = torch.where(local_boolean_mask, 0, min_dtype).contiguous()
+    mask_tensor = torch.where(boolean_mask, 0, torch.tensor(min_dtype, dtype=torch.float32, device=device)).contiguous()
+    local_mask_tensor = torch.where(local_boolean_mask, 0, torch.tensor(min_dtype, dtype=torch.float32, device=device)).contiguous()
 
     kv_caches = []
     for _ in range(self.config.num_hidden_layers):
@@ -247,25 +247,22 @@ def generate(
 
     input_token_ids_tensor = torch.full((batch_size, min_prompt_len),
                                             self.tokenizer.pad_id,
-                                            dtype=torch.int64)
+                                            dtype=torch.int64, device=device)
     token_ids_tensor = user_input_token_ids.to(device)
     for i in range(batch_size):
       p = user_input_token_ids[i]
       input_token_ids_tensor[i, :min_prompt_len] = p[:min_prompt_len]
 
-    token_ids_tensor = token_ids_tensor.to(device)
-    input_token_ids_tensor = input_token_ids_tensor.to(device)
-    input_positions_tensor = torch.arange(0, min_prompt_len, dtype=torch.int64).to(device)
+    input_positions_tensor = torch.arange(0, min_prompt_len, dtype=torch.int64, device=device)
     prompt_mask_tensor = token_ids_tensor != self.tokenizer.pad_id
     curr_mask_tensor = mask_tensor.index_select(2, input_positions_tensor)
     curr_local_mask_tensor = local_mask_tensor.index_select(2, input_positions_tensor)
-    output_positions_tensor = torch.LongTensor([min_prompt_len - 1])
+    output_positions_tensor = torch.LongTensor([min_prompt_len - 1]).to(device)
     temperatures_tensor = None if not temperature else torch.FloatTensor(
             [temperature] * batch_size).to(device)
     top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
     top_ks_tensor = torch.LongTensor([top_k] * batch_size).to(device)
-    output_index = torch.tensor(min_prompt_len, dtype=torch.int64).to(
-            device)
+    output_index = torch.tensor(min_prompt_len, dtype=torch.int64, device=device)
 
     # Prefill up to min_prompt_len tokens, then treat other prefill as
     # decode and ignore output.
@@ -298,8 +295,7 @@ def generate(
       curr_local_mask_tensor = local_mask_tensor.index_select(
                 2, input_positions_tensor
             ) if local_mask_tensor is not None else None
-      output_positions_tensor = torch.tensor(0, dtype=torch.int64).to(
-                device)
+      output_positions_tensor = torch.tensor(0, dtype=torch.int64, device=device)
       output_index = output_index + 1
       image_batch = None
       image_presence_mask = None
diff --git a/gemma/gemma3_preprocessor.py b/gemma/gemma3_preprocessor.py
@@ -172,7 +172,7 @@ def tokenize_raw_input(
                         config.vision_config.input_channels,
                         config.vision_config.image_size,
                         config.vision_config.image_size,
-                    )
+                    ), device=device
                 )
                 for _ in range(pad_length)
             ]
@@ -182,12 +182,12 @@ def tokenize_raw_input(
         image_presence_mask.append(presence_mask)
 
     # Convert lists to tensors
-    user_input_token_ids = torch.tensor(user_input_token_ids, dtype=torch.long).to(device)
+    user_input_token_ids = torch.tensor(user_input_token_ids, dtype=torch.long, device=device)
     if max_num_images > 0:
         image_batch = torch.stack([torch.stack(images) for images in image_batch]).to(
             device
         )
-        image_presence_mask = torch.tensor(image_presence_mask, dtype=torch.bool).to(device)
+        image_presence_mask = torch.tensor(image_presence_mask, dtype=torch.bool, device=device)
     else:
         image_batch = None
         image_presence_mask = None
diff --git a/gemma/model.py b/gemma/model.py
@@ -594,7 +594,7 @@ def forward(
     # Gemma normalizes the embedding by sqrt(hidden_size).
     # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
     # See https://github.com/huggingface/transformers/pull/29402
-    normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
+    normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype, device=hidden_states.device)
     hidden_states = hidden_states * normalizer
 
     hidden_states = self.model(
@@ -677,7 +677,7 @@ def generate(
     curr_local_mask_tensor = local_mask_tensor.index_select(
           2, input_positions_tensor
       ) if local_mask_tensor is not None else None
-    output_positions_tensor = torch.LongTensor([min_prompt_len - 1]).to(device)
+    output_positions_tensor = torch.LongTensor([min_prompt_len - 1], device=device)
     temperatures_tensor = None if not temperature else torch.FloatTensor(
             [temperature] * batch_size).to(device)
     top_ps_tensor = torch.FloatTensor([top_p] * batch_size).to(device)
diff --git a/gemma/siglip_vision/siglip_vision_model.py b/gemma/siglip_vision/siglip_vision_model.py
@@ -104,7 +104,7 @@ def gelu_tanh(self, x):
         * (
             1
             + torch.tanh(
-                torch.sqrt(torch.tensor(2.0 / torch.pi))
+                torch.sqrt(torch.tensor(2.0 / torch.pi, device=x.device))
                 * (x + 0.044715 * torch.pow(x, 3))
             )
         )
@@ -192,7 +192,8 @@ def forward(
     # (batch_size,channels,height,width)->(batch_size, height*width, channels)
     x = x.flatten(2).transpose(1, 2)
 
-    x = x + self.position_embedding(self.position_ids)
+    position_ids = self.position_ids.to(pixel_values.device)
+    x = x + self.position_embedding(position_ids)
 
     for block in self.encoder_blocks:
       x = block(x)  # batch_size, height*width, embedding_dim (1152)
diff --git a/scripts/run.py b/scripts/run.py
@@ -68,7 +68,7 @@ def _set_default_tensor_type(dtype: torch.dtype):
 def main(_):
     # Construct the model config.
     model_config = config.get_model_config(FLAGS.variant)
-    model_config.dtype = "float32" if FLAGS.device == "cpu" else "float16"
+    model_config.dtype = "float32"
     model_config.quant = FLAGS.quant
 
     # Seed random.
diff --git a/scripts/run_multimodal.py b/scripts/run_multimodal.py
@@ -85,7 +85,7 @@ def _set_default_tensor_type(dtype: torch.dtype):
 def main(_):
   # Construct the model config.
   model_config = config.get_model_config(_VARIANT.value)
-  model_config.dtype = 'float32' if _DEVICE.value == 'cpu' else 'float16'
+  model_config.dtype = 'float32'
   model_config.quant = _QUANT.value
   image_paths = {"cow_in_beach": "scripts/images/cow_in_beach.jpg",
                    "lilly": "scripts/images/lilly.jpg",

Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ def gelu_tanh(self, x):`
`104`	`104`	`* (`
`105`	`105`	`1`
`106`	`106`	`+ torch.tanh(`
`107`		`- torch.sqrt(torch.tensor(2.0 / torch.pi))`
	`107`	`+ torch.sqrt(torch.tensor(2.0 / torch.pi, device=x.device))`
`108`	`108`	`* (x + 0.044715 * torch.pow(x, 3))`
`109`	`109`	`)`
`110`	`110`	`)`
`@@ -192,7 +192,8 @@ def forward(`
`192`	`192`	`# (batch_size,channels,height,width)->(batch_size, height*width, channels)`
`193`	`193`	`x = x.flatten(2).transpose(1, 2)`
`194`	`194`
`195`		`- x = x + self.position_embedding(self.position_ids)`
	`195`	`+ position_ids = self.position_ids.to(pixel_values.device)`
	`196`	`+ x = x + self.position_embedding(position_ids)`
`196`	`197`
`197`	`198`	`for block in self.encoder_blocks:`
`198`	`199`	`x = block(x) # batch_size, height*width, embedding_dim (1152)`