Skip to content

Commit 89a8403

Browse files
ringohoffman authored and jainapurva committed
Refactor tiktoken import bare except (#1024)
* Refactor tiktoken import bare except

We shouldn't suppress the import error if tiktoken is not installed. Instead of importing it at the top of the file, we can import it inside the only functions that use these imports; we can consider a different import structure if we end up needing to access these modules in more places. This lazy importing should also decrease the load time of these modules. Also do the same thing for sentencepiece.

* Remove __future__.annotations import
1 parent 047e656 commit 89a8403

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

torchao/_models/llama/tokenizer.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,7 @@
11
# copied from https://github.com/pytorch-labs/gpt-fast/blob/main/tokenizer.py
22

33
import os
4-
import sentencepiece as spm
5-
try:
6-
import tiktoken
7-
from tiktoken.load import load_tiktoken_bpe
8-
except:
9-
pass
104
from pathlib import Path
11-
from typing import Dict
125

136
class TokenizerInterface:
147
def __init__(self, model_path):
@@ -28,6 +21,8 @@ def eos_id(self):
2821

2922
class SentencePieceWrapper(TokenizerInterface):
3023
def __init__(self, model_path):
24+
import sentencepiece as spm
25+
3126
super().__init__(model_path)
3227
self.processor = spm.SentencePieceProcessor(str(model_path))
3328
self.bos_token_id = self.bos_id()
@@ -50,16 +45,19 @@ class TiktokenWrapper(TokenizerInterface):
5045
Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
5146
"""
5247

53-
special_tokens: Dict[str, int]
48+
special_tokens: dict[str, int]
5449

5550
num_reserved_special_tokens = 256
5651

5752
pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501
5853

5954
def __init__(self, model_path):
55+
import tiktoken
56+
import tiktoken.load
57+
6058
super().__init__(model_path)
6159
assert os.path.isfile(model_path), str(model_path)
62-
mergeable_ranks = load_tiktoken_bpe(str(model_path))
60+
mergeable_ranks = tiktoken.load.load_tiktoken_bpe(str(model_path))
6361
num_base_tokens = len(mergeable_ranks)
6462
special_tokens = [
6563
"<|begin_of_text|>",

0 commit comments

Comments (0)