import json
from pathlib import Path
+from typing import Tuple

from tokenizers import Tokenizer, AddedToken

from fastembed.image.transform.operators import Compose


-def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
+def load_special_tokens(model_dir: Path) -> dict:
+    tokens_map_path = model_dir / "special_tokens_map.json"
+    if not tokens_map_path.exists():
+        raise ValueError(f"Could not find special_tokens_map.json in {model_dir}")
+
+    with open(str(tokens_map_path)) as tokens_map_file:
+        tokens_map = json.load(tokens_map_file)
+
+    return tokens_map
+
+
+def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tuple[Tokenizer, dict]:
    config_path = model_dir / "config.json"
    if not config_path.exists():
        raise ValueError(f"Could not find config.json in {model_dir}")
@@ -19,18 +31,13 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
    if not tokenizer_config_path.exists():
        raise ValueError(f"Could not find tokenizer_config.json in {model_dir}")

-    tokens_map_path = model_dir / "special_tokens_map.json"
-    if not tokens_map_path.exists():
-        raise ValueError(f"Could not find special_tokens_map.json in {model_dir}")
-
    with open(str(config_path)) as config_file:
        config = json.load(config_file)

    with open(str(tokenizer_config_path)) as tokenizer_config_file:
        tokenizer_config = json.load(tokenizer_config_file)

-    with open(str(tokens_map_path)) as tokens_map_file:
-        tokens_map = json.load(tokens_map_file)
+    tokens_map = load_special_tokens(model_dir)

    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.enable_truncation(max_length=min(tokenizer_config["model_max_length"], max_length))
@@ -44,7 +51,16 @@ def load_tokenizer(model_dir: Path, max_length: int = 512) -> Tokenizer:
        elif isinstance(token, dict):
            tokenizer.add_special_tokens([AddedToken(**token)])

-    return tokenizer
+    special_token_to_id = {}
+
+    for token in tokens_map.values():
+        if isinstance(token, str):
+            special_token_to_id[token] = tokenizer.token_to_id(token)
+        elif isinstance(token, dict):
+            token_str = token.get("content", "")
+            special_token_to_id[token_str] = tokenizer.token_to_id(token_str)
+
+    return tokenizer, special_token_to_id


def load_preprocessor(model_dir: Path) -> Compose:
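For reference, a minimal usage sketch of the updated signature, which now returns the tokenizer together with a mapping from special-token strings to their ids. The model_dir path and the "[CLS]"/"[PAD]" token names below are hypothetical and depend on the particular model; the directory is assumed to contain the config.json, tokenizer_config.json, tokenizer.json, and special_tokens_map.json files checked for above.

from pathlib import Path

# Hypothetical local model snapshot; any directory with the required JSON files works.
model_dir = Path("local_cache/my-model")

tokenizer, special_token_to_id = load_tokenizer(model_dir, max_length=512)

# Special-token ids are now available without extra token_to_id() lookups,
# e.g. for building masks or locating the CLS position in the output.
cls_id = special_token_to_id.get("[CLS]")
pad_id = special_token_to_id.get("[PAD]")

encoding = tokenizer.encode("a photo of a cat")
print(encoding.ids, cls_id, pad_id)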