@@ -840,6 +840,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct
+            res = "kimi-k2"
 
         if res is None:
             logger.warning("\n")
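For context: get_vocab_base_pre() recognizes a tokenizer by hashing the token ids it produces for a fixed probe string, so each new pre-tokenizer needs its checksum registered as above. A minimal sketch of how such a checksum is obtained; the probe text below is a hypothetical stand-in, the real script hashes its own chktxt constant:

# Sketch: fingerprint a tokenizer the way get_vocab_base_pre() does.
# PROBE is a hypothetical stand-in for the script's real chktxt constant.
from hashlib import sha256
from transformers import AutoTokenizer

PROBE = "Hello world \n\t 3.14 \U0001F680"  # stand-in probe text
tokenizer = AutoTokenizer.from_pretrained("moonshotai/Kimi-K2-Instruct", trust_remote_code=True)
chkhsh = sha256(str(tokenizer.encode(PROBE)).encode()).hexdigest()
print(chkhsh)  # compare against the hashes registered above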
@@ -5563,7 +5566,68 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
+        # Fall back to loading the tokenizer with trust_remote_code=True
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+        merges = []
+        vocab = {}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        if tokpre == "kimi-k2":
+            # Copied from the Hunyuan tokenizer conversion
+            # 2. Reverse-engineer the merges list from mergeable_ranks
+            merges = []
+            vocab = {}
+            from tiktoken.load import load_tiktoken_bpe
+            mergeable_ranks = load_tiktoken_bpe(tokenizer.vocab_file)
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:  # TODO: this is an assert in Qwen, why?
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # 3. Generate the tokens and toktypes lists
+            vocab_size = self.hparams["vocab_size"]
+            assert tokenizer.vocab_size == vocab_size
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            # 4. Add special tokens and chat templates
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+            # FIX - Kimi-K2 does not add a BOS
+            self.gguf_writer.add_bos_token(False)
+        else:
+            raise NotImplementedError(f"{self.dir_model} is not supported yet!")
+
+        # 5. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
 
     def set_gguf_parameters(self):
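The merge recovery above leans on QwenModel.bpe(): re-splitting each ranked token while only allowing merges of strictly lower rank leaves exactly two parts precisely when the token itself was created by a single learned merge. A self-contained toy (not the QwenModel helper itself, and with a hypothetical four-entry rank table; real tables come from load_tiktoken_bpe()) showing the idea:

# Toy illustration of reverse-engineering BPE merges from a rank table.
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
    # Re-split `token` using only merges of strictly lower rank; if exactly
    # two parts remain, their pairing was learned as a single BPE merge.
    parts = [bytes([b]) for b in token]
    while True:
        best = None  # (rank, index) of the cheapest adjacent pair
        for i in range(len(parts) - 1):
            rank = mergeable_ranks.get(parts[i] + parts[i + 1])
            if rank is not None and rank < max_rank and (best is None or rank < best[0]):
                best = (rank, i)
        if best is None:
            break
        _, i = best
        parts = parts[:i] + [parts[i] + parts[i + 1]] + parts[i + 2:]
    return parts

ranks = {b"a": 0, b"b": 1, b"ab": 2, b"abb": 3}  # hypothetical rank table
merges = []
for token, rank in ranks.items():
    if len(token) == 1:
        continue
    merged = bpe(ranks, token, max_rank=rank)
    if len(merged) == 2:
        merges.append(merged)

print(merges)  # [[b'a', b'b'], [b'ab', b'b']] -> merges "a b" and "ab b"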
@@ -6973,6 +7037,8 @@ def set_vocab(self):
         special_vocab.add_to_gguf(self.gguf_writer)
         # FIX for BOS token: Overwrite incorrect id read from config.json
         self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+        # FIX - Hunyuan does not add a BOS
+        self.gguf_writer.add_bos_token(False)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
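Both BOS fixes follow the same pattern: config.json advertises a BOS id, but the tokenizer never actually prepends it, so the GGUF metadata must record add_bos_token = false. A quick way to check that behavior for any HF tokenizer (the model id shown is illustrative):

# Sketch: verify whether a tokenizer actually prepends its BOS token.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("moonshotai/Kimi-K2-Instruct", trust_remote_code=True)
ids = tok("hello", add_special_tokens=True)["input_ids"]
print(tok.bos_token_id, ids[:3])  # if ids[0] != bos_token_id, no BOS is added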