From 949ef993885e3941e20a496f0c0330d24b74489c Mon Sep 17 00:00:00 2001 From: Vivek Agrawal <197589114+amd-vivekag@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:42:26 +0530 Subject: [PATCH] Fixes the bug of moving input_names after tokenizer call (#449) Fixes the bug introduced in commit: "Fixes native inference input size mistmatch issue (#447)" tokenizer.model_input_names should be updated before tokenizer call. --- alt_e2eshark/onnx_tests/models/hf_models.py | 41 ++++++++++----------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/alt_e2eshark/onnx_tests/models/hf_models.py b/alt_e2eshark/onnx_tests/models/hf_models.py index 8efc68c1..ac61b726 100644 --- a/alt_e2eshark/onnx_tests/models/hf_models.py +++ b/alt_e2eshark/onnx_tests/models/hf_models.py @@ -171,7 +171,7 @@ def get_tokenizer_from_model_path(model_repo_path: str, cache_dir: str | Path): if 'kobert' in name.lower(): trust_remote_code = True - return AutoTokenizer.from_pretrained(model_repo_path, cache_dir=cache_dir, trust_remote_code=True) + return AutoTokenizer.from_pretrained(model_repo_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code) def build_repo_to_model_map(): @@ -245,32 +245,29 @@ def construct_inputs(self): tokenizer = get_tokenizer_from_model_path(self.model_repo_path, self.cache_dir) - tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True) - self.input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()} - if self.name in models_with_input_names_2: # Handles 2 inputs tokenizer.model_input_names = ["input_ids", "attention_mask"] - inputs = (*list(tokens.values()), ) - else: - self.input_name_to_shape_map["position_ids"] = self.input_name_to_shape_map["input_ids"] - zeros = torch.zeros(*(self.input_name_to_shape_map["input_ids"]), dtype=int) - if self.name in models_with_input_names_3: - # Handles 3 inputs - tokenizer.model_input_names = ["input_ids", "attention_mask", "position_ids"] - elif self.name in models_with_input_names_4: - tokenizer.model_input_names = ["input_ids", "bbox", "attention_mask", "position_ids"] - - # Handles 4 inputs - # Tokenizer is returning tokens dict with key token_type_ids" instead of "bbox". - # For now, "token_type_ids" will be reused as bbox in this case - # bbox is a bounding box with size [?, ?, 4] - # where each 4 numbers represent x_min, y_min, x_max, y_max - tokens["token_type_ids"] = tokens["token_type_ids"].unsqueeze(-1).repeat(1, 1, 4) - else: - raise RuntimeError(f"Model: {self.name} not found in any of the registry lists.") + tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True) + + if self.name in models_with_input_names_4: + # Handles 4 inputs + # Tokenizer is returning tokens dict with key token_type_ids" instead of "bbox". + # For now, "token_type_ids" will be reused as bbox in this case + # bbox is a bounding box with size [?, ?, 4] + # where each 4 numbers represent x_min, y_min, x_max, y_max + print(f'DEBUG: {tokens=}') + tokens["token_type_ids"] = tokens["token_type_ids"].unsqueeze(-1).repeat(1, 1, 4) + + self.input_name_to_shape_map = {k: v.shape for (k, v) in tokens.items()} + if self.name in models_with_input_names_3 or self.name in models_with_input_names_4: + # Handles 3 and 4 inputs + self.input_name_to_shape_map["position_ids"] = self.input_name_to_shape_map["input_ids"] + zeros = torch.zeros(*(self.input_name_to_shape_map["position_ids"]), dtype=int) inputs = (*list(tokens.values()), zeros) + else: + inputs = (*list(tokens.values()), ) test_tensors = TestTensors(inputs) return test_tensors