Skip to content

Commit 886550a

Browse files
amd-vivekag (Vivek Agrawal) and others authored
Skip tokenizer checks in favor of AutoTokenizer (#442)
Removes if conditions based on model name to decide Tokenizer class. Instead, directly uses AutoTokenizer class. Co-authored-by: Vinayak Dev <[email protected]> --------- Co-authored-by: Vivek Agrawal <[email protected]>
1 parent b297f17 commit 886550a

File tree

6 files changed

+47
-28
lines changed

6 files changed

+47
-28
lines changed

alt_e2eshark/onnx_tests/helper_classes.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ def export_model(self, optim_level: str | None = None):
8686
optimize=optim_level,
8787
)
8888

89+
def __repr__(self):
    """Debug representation: class name, HF model path, task, test name,
    and the directory holding the exported ONNX model."""
    class_name = type(self).__name__
    # "org/model" -> "org/hf_model", matching the hf_-prefixed path convention.
    full_model_path = "/hf_".join(self.model_repo_path.split("/"))
    return (
        f"{class_name} (full_model_path={full_model_path}, task_name={self.task}, "
        f"name={self.name}, onnx_model_path={os.path.dirname(self.model)})"
    )
94+
8995
def construct_model(self):
9096
model_dir = str(Path(self.model).parent)
9197

@@ -100,7 +106,12 @@ def find_models(model_dir):
100106
found_models = find_models(model_dir)
101107

102108
if len(found_models) == 0:
103-
self.export_model()
109+
try:
110+
self.export_model()
111+
except:
112+
#print(self.__repr__())
113+
raise RuntimeError("Failed to Export class: ", self)
114+
104115
found_models = find_models(model_dir)
105116
if len(found_models) == 1:
106117
self.model = found_models[0]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ntu-spml/hf_distilhubert
2+
microsoft/hf_wavlm-base-plus

alt_e2eshark/onnx_tests/models/external_lists/hf-model-paths/hf-feature-extraction-model-list.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ BAAI/hf_bge-large-en-v1.5
1313
mixedbread-ai/hf_mxbai-embed-large-v1
1414
BAAI/hf_bge-base-en-v1.5
1515
facebook/hf_bart-base
16-
ntu-spml/hf_distilhubert
1716
cointegrated/hf_rubert-tiny
1817
sentence-transformers/hf_paraphrase-multilingual-mpnet-base-v2
1918
BAAI/hf_bge-large-zh-v1.5
@@ -46,7 +45,6 @@ sentence-transformers/hf_msmarco-distilbert-base-v4
4645
avsolatorio/hf_GIST-Embedding-v0
4746
sentence-transformers/hf_msmarco-distilbert-base-tas-b
4847
sentence-transformers/hf_paraphrase-mpnet-base-v2
49-
microsoft/hf_wavlm-base-plus
5048
avsolatorio/hf_GIST-large-Embedding-v0
5149
Supabase/hf_gte-small
5250
sentence-transformers/hf_paraphrase-MiniLM-L3-v2
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
hf_distilhubert
2+
hf_wavlm-base-plus

alt_e2eshark/onnx_tests/models/external_lists/hf-model-shards/hf-feature-extraction-shard.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ hf_bge-large-en-v1.5
1313
hf_mxbai-embed-large-v1
1414
hf_bge-base-en-v1.5
1515
hf_bart-base
16-
hf_distilhubert
1716
hf_rubert-tiny
1817
hf_paraphrase-multilingual-mpnet-base-v2
1918
hf_bge-large-zh-v1.5
@@ -46,7 +45,6 @@ hf_msmarco-distilbert-base-v4
4645
hf_GIST-Embedding-v0
4746
hf_msmarco-distilbert-base-tas-b
4847
hf_paraphrase-mpnet-base-v2
49-
hf_wavlm-base-plus
5048
hf_GIST-large-Embedding-v0
5149
hf_gte-small
5250
hf_paraphrase-MiniLM-L3-v2

alt_e2eshark/onnx_tests/models/hf_models.py

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66

77
import requests
8+
import torch
89

910
from pathlib import Path
1011

@@ -14,11 +15,6 @@
1415

1516
from transformers import (
1617
AutoTokenizer,
17-
BartTokenizer,
18-
BertTokenizer,
19-
PhobertTokenizer,
20-
RobertaTokenizer,
21-
XLMRobertaTokenizer,
2218
)
2319

2420
from torchvision import transforms
@@ -41,6 +37,7 @@
4137
"object-detection",
4238
"image-segmentation",
4339
"semantic-segmentation",
40+
"audio-classification",
4441
]
4542

4643
# These are NLP model names that have a mismatch between tokenizer
@@ -148,26 +145,13 @@
148145

149146

150147
def get_tokenizer_from_model_path(model_repo_path: str, cache_dir: str | Path):
    """Load the tokenizer for ``model_repo_path`` via ``AutoTokenizer``.

    Args:
        model_repo_path: Hugging Face repo path, e.g. "org/hf_model-name".
        cache_dir: directory used as the transformers download cache.

    Returns:
        The tokenizer instance produced by ``AutoTokenizer.from_pretrained``.

    Remote code execution is enabled only for model families (currently
    kobert) whose tokenizer implementation is not shipped with transformers.
    """
    trust_remote_code = False

    name = model_repo_path.split("/")[-1]
    if 'kobert' in name.lower():
        trust_remote_code = True

    # Bug fix: the call previously hard-coded trust_remote_code=True,
    # ignoring the flag computed above and enabling remote code for
    # every model instead of only kobert.
    return AutoTokenizer.from_pretrained(
        model_repo_path, cache_dir=cache_dir, trust_remote_code=trust_remote_code
    )
171155

172156

173157
def build_repo_to_model_map():
@@ -215,6 +199,13 @@ def build_repo_to_model_map():
215199
)
216200
)
217201

202+
# Meta constructor for all random-input (audio-classification) models.
203+
meta_constructor_random_input = lambda m_name: (
204+
lambda *args, **kwargs: HfModelWithRandomInput(
205+
model_repo_map[m_name][0], model_repo_map[m_name][1], *args, **kwargs
206+
)
207+
)
208+
218209
# Meta constructor for all multiple choice models.
219210
meta_constructor_multiple_choice = lambda m_name: (
220211
lambda *args, **kwargs: HfModelMultipleChoice(
@@ -245,6 +236,21 @@ def construct_inputs(self):
245236
return test_tensors
246237

247238

239+
class HfModelWithRandomInput(HfDownloadableModel):
    """Test wrapper for models (registered for the audio-classification task)
    that are driven by a random input tensor rather than tokenized text."""

    def export_model(self, optim_level: str | None = None):
        # We won't need optim_level.
        del optim_level
        # "O1" optimization is applied only for models listed in basic_opt.
        super().export_model("O1" if self.name in basic_opt else None)

    def construct_inputs(self):
        # Random audio-like input: 1 batch x 4 channels x 16000 samples.
        inputs = torch.randn(1, 4, 16000)

        # NOTE(review): the keys 'input_ids'/'attention_mask' and shape
        # torch.Size([16000, 4]) do not match the (1, 4, 16000) tensor built
        # above, and audio models normally take 'input_values' — confirm what
        # downstream consumers of input_name_to_shape_map actually expect.
        self.input_name_to_shape_map = {'input_ids': torch.Size([16000, 4]), 'attention_mask': torch.Size([16000, 4])}

        test_tensors = TestTensors(inputs)
        return test_tensors
252+
253+
248254
class HfModelMultipleChoice(HfDownloadableModel):
249255
def export_model(self, optim_level: str | None = None):
250256
# We won't need optim_level.
@@ -334,6 +340,8 @@ def setup_test_image(height=224, width=224):
334340
| "semantic-segmentation"
335341
):
336342
register_test(meta_constructor_cv(t), t)
343+
case "audio-classification":
344+
register_test(meta_constructor_random_input(t), t)
337345
case "multiple-choice":
338346
register_test(meta_constructor_multiple_choice(t), t)
339347
case _:

0 commit comments

Comments
 (0)