tenstorrent · ipotkonjak-tt · Feb 26, 2025 · Feb 28, 2025 · Feb 28, 2025 · Mar 3, 2025
@@ -64,10 +64,12 @@ def test_llama_multimodal_demo_chat(
         logger.info(f"Creating TT model on {len(mesh_device.get_devices())} devices")
         mesh_device.enable_program_cache()
         mesh_device.enable_async(True)
-        model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len)
+        model_args, model, _ = create_multimodal_model(
+            mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len
+        )
         tokenizer = Tokenizer(model_path=tokenizer_path)
         formatter = ChatFormat(tokenizer)
-        generator = LlamaGenerator(model, model_args, mesh_device, tokenizer=tokenizer, formatter=formatter)
+        generator = LlamaGenerator([model], [model_args], mesh_device, tokenizer=tokenizer, formatter=formatter)
 
     # image understanding
     dialogs = []

@@ -70,10 +70,12 @@ def test_llama_multimodal_demo_text(
         logger.info(f"Creating TT model on {len(mesh_device.get_devices())} devices")
         mesh_device.enable_program_cache()
         mesh_device.enable_async(True)
-        model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len)
+        model_args, model, _ = create_multimodal_model(
+            mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len
+        )
         tokenizer = Tokenizer(model_path=tokenizer_path)
         formatter = ChatFormat(tokenizer)
-        generator = LlamaGenerator(model, model_args, mesh_device, tokenizer=tokenizer, formatter=formatter)
+        generator = LlamaGenerator([model], [model_args], mesh_device, tokenizer=tokenizer, formatter=formatter)
 
     with open(IMG_PATH / "dog.jpg", "rb") as f:
         img = PIL_Image.open(f).convert("RGB")

@@ -41,14 +41,17 @@ def sample(logits):
     return sample
 
 
-def create_multimodal_model(mesh_device, max_batch_size, max_seq_len, dtype=ttnn.bfloat16, use_paged_kv_cache=False):
+def create_multimodal_model(
+    mesh_device, max_batch_size, max_seq_len, dtype=ttnn.bfloat16, use_paged_kv_cache=False, checkpoint=None
+):
     from models.demos.llama3.tt.multimodal.llama_vision_model import CrossAttentionTransformer
     from models.demos.llama3.tt.model_config import TtModelArgs
 
     tt_model_args = TtModelArgs(mesh_device, max_batch_size=max_batch_size)
     # limit length or we'll run out of space
     tt_model_args.max_seq_len = max_seq_len
-    checkpoint = torch.load(tt_model_args.consolidated_weights_path, map_location="cpu", weights_only=True)
+    if checkpoint is None:
+        checkpoint = torch.load(tt_model_args.consolidated_weights_path, map_location="cpu", weights_only=True)
     model = CrossAttentionTransformer(
         mesh_device,
         checkpoint,
@@ -57,7 +60,36 @@ def create_multimodal_model(mesh_device, max_batch_size, max_seq_len, dtype=ttnn
         configuration=tt_model_args,
         use_paged_kv_cache=use_paged_kv_cache,
     )
-    return tt_model_args, model
+    return tt_model_args, model, checkpoint
+
+
+def prepare_generator_args(
+    num_devices, data_parallel, mesh_device, max_batch_size, max_seq_len, dtype=ttnn.bfloat16, use_paged_kv_cache=False
+):  # Builds one multimodal model and its args object per submesh; returns them as two parallel lists.
+    # Partition the mesh into 1 x (num_devices // data_parallel) submeshes; a single model is implemented for TP on a 1xN mesh
+    submesh_devices = (
+        mesh_device.create_submeshes(ttnn.MeshShape(1, num_devices // data_parallel))
+        if isinstance(mesh_device, ttnn.MeshDevice) and data_parallel > 1
+        else [mesh_device]  # no partitioning: the device/mesh is used as-is
+    )
+    state_dict = None  # None makes the first create_multimodal_model call load the checkpoint itself
+
+    model_args = []  # one args object per submesh, in the same order as `model`
+    model = []  # one model per submesh
+
+    for submesh in submesh_devices:
+        model_args_i, model_i, state_dict = create_multimodal_model(  # returned checkpoint is passed to later iterations
+            mesh_device=submesh,
+            max_batch_size=max_batch_size // data_parallel,  # total batch split per DP group (floor division)
+            max_seq_len=max_seq_len,
+            dtype=dtype,
+            use_paged_kv_cache=use_paged_kv_cache,
+            checkpoint=state_dict,  # None on the first iteration, the loaded checkpoint afterwards
+        )
+        model_args.append(model_args_i)
+        model.append(model_i)
+
+    return model_args, model  # (list of model args, list of models)
 
 
 @pytest.mark.parametrize(
@@ -84,13 +116,21 @@ def create_multimodal_model(mesh_device, max_batch_size, max_seq_len, dtype=ttnn
     ],
     ids=["batch1-notrace", "batch1-trace", "batch32-trace", "batch4-trace-with-text-prompts"],
 )
+@pytest.mark.parametrize(
+    "data_parallel",
+    [
+        1,
+        # 4,
+    ],
+)
 @pytest.mark.parametrize("device_params", [{"trace_region_size": 14951424, "num_command_queues": 2}], indirect=True)
 def test_llama_multimodal_demo_text(
     mesh_device,
     warmup_iters,
     enable_trace,
     max_batch_size,
     include_text_only_prompts,
+    data_parallel,
     test_type,
     max_seq_len,
     temperature: float = 0,
@@ -106,12 +146,22 @@ def test_llama_multimodal_demo_text(
 
     mesh_device.enable_program_cache()
     mesh_device.enable_async(True)
-    model_args, model = create_multimodal_model(mesh_device, max_batch_size=max_batch_size, max_seq_len=max_seq_len)
+
+    num_devices = mesh_device.get_num_devices() if isinstance(mesh_device, ttnn.MeshDevice) else 1
+    max_batch_size *= data_parallel  # input batch_size is interpreted as size per DP group
+
+    model_args, model = prepare_generator_args(
+        num_devices=num_devices,
+        data_parallel=data_parallel,
+        mesh_device=mesh_device,
+        max_batch_size=max_batch_size,
+        max_seq_len=max_seq_len,
+    )
     generator = LlamaGenerator(model, model_args, mesh_device)
     tokenizer = Tokenizer(model_path=tokenizer_path)
     formatter = ChatFormat(tokenizer)
 
-    xattn_caches = generator.model.setup_cache(model_args.max_batch_size)
+    xattn_caches = [model.setup_cache(model_args[i].max_batch_size) for i, model in enumerate(generator.model)]
 
     with open(IMG_PATH / "ocr_image.jpeg", "rb") as f:
         ocr_image = PIL_Image.open(f).convert("RGB")

@@ -219,7 +219,7 @@ def test_tt_model_acc(
         ) = preprocess_inputs_prefill(
             input_prompts,
             tokenizer,
-            model_args,
+            [model_args],
             instruct=False,
             max_generated_tokens=decode_len,
             max_prefill_len=prefill_len,

@@ -126,7 +126,7 @@ def test_chunked_prefill_single_user(
         weight_cache_path=model_args.weight_cache_path(dtype),
         paged_attention_config=paged_attention_config,
     )
-    generator = LlamaGenerator(tt_model, model_args, mesh_device)
+    generator = LlamaGenerator([tt_model], [model_args], mesh_device)
 
     logger.info("Model and caches loaded.")
 
@@ -150,13 +150,15 @@ def test_chunked_prefill_single_user(
     logger.info("Running TT model")
     for last_token_idx in range(prefill_chunk_size - 10, seq_len, prefill_chunk_size):
         logger.info(f"Running TT model for last_token_idx: {last_token_idx}")
-        tt_output_torch = generator.prefill_forward_single_user_text(
+        tt_output_device = generator.prefill_forward_single_user_text(
             tt_prefill_input,
             page_table=static_page_table,
             user_id=0,
             last_token_idx=last_token_idx,
             kv_cache=tt_kv_cache,
         )
+
+        tt_output_torch = tt_model.process_output_prefill(tt_output_device, last_token_idx=(last_token_idx % 32))
         tt_output_torch = tt_output_torch.reshape(batch_size, 1, -1)
 
         ref_output_slice = ref_output[:, last_token_idx : last_token_idx + 1, :]

@@ -321,7 +321,7 @@ def test_llama_model_inference(
             # Greedy decode (temperature = 0) the generated token and save it to print out later
             if run_ref_pt:
                 # Sample from reference model first
-                _, pt_out_tok = sample_host(ref_output, None, temperature=0, top_p=0.8)
+                _, pt_out_tok = sample_host(ref_output, temperature=0, top_p=0.8)
                 pt_decode_input = embd(pt_out_tok)
                 all_outputs_ref.append(pt_out_tok.squeeze(1).tolist()[0])
 
@@ -330,7 +330,7 @@ def test_llama_model_inference(
                 all_outputs.append(pt_out_tok.squeeze(1).tolist()[0])
             else:
                 # If not running reference model, sample from TT model directly
-                _, tt_out_tok = sample_host(tt_output_torch, None, temperature=0, top_p=0.8)
+                _, tt_out_tok = sample_host(tt_output_torch, temperature=0, top_p=0.8)
                 tt_decode_input = embd(tt_out_tok)
                 all_outputs.append(tt_out_tok.squeeze(1).tolist()[0])