
Commit f1bdfd4

Fixes
1 parent c31e606 commit f1bdfd4

3 files changed: +95 -1 lines changed

convert_hf_to_gguf.py

Lines changed: 67 additions & 1 deletion
@@ -840,6 +840,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct
+            res = "kimi-k2"
 
         if res is None:
             logger.warning("\n")
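
Note: the chkhsh literals compared above are fingerprints of pre-tokenization behaviour: the converter encodes a fixed probe string with the model's tokenizer and hashes the resulting token IDs. A minimal sketch of the idea (the probe text and helper name are illustrative, not the script's actual chktxt):

from hashlib import sha256
from transformers import AutoTokenizer

def vocab_fingerprint(model_dir: str, probe: str) -> str:
    # Encode a fixed probe string and hash the token IDs; tokenizers that
    # pre-tokenize differently produce different ID sequences, hence hashes.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tokenizer.encode(probe)
    return sha256(str(ids).encode()).hexdigest()

# A new tokenizer yields an unknown hash, which is the cue to add a chkhsh
# branch like the kimi-k2 one above.
print(vocab_fingerprint("moonshotai/Kimi-K2-Instruct", "Hello 世界 123 'don't"))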
@@ -5563,7 +5566,68 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
+        # Fall back to loading the tokenizer with trust_remote_code=True
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+        merges = []
+        vocab = {}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        if tokpre == "kimi-k2":
+            # Copied from the Hunyuan tokenizer conversion
+            # 2. Reverse-engineer the merges list from mergeable_ranks
+            merges = []
+            vocab = {}
+            from tiktoken.load import load_tiktoken_bpe
+            mergeable_ranks = load_tiktoken_bpe(tokenizer.vocab_file)
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:  # TODO: this is an assert in Qwen, why?
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # 3. Generate the tokens and toktypes lists
+            vocab_size = self.hparams["vocab_size"]
+            assert tokenizer.vocab_size == vocab_size
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            # 5. Add special tokens and chat templates
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+            # FIX - Kimi-K2 does not add a BOS
+            self.gguf_writer.add_bos_token(False)
+        else:
+            raise NotImplementedError(f"{self.dir_model} is not supported yet!")
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
 
     def set_gguf_parameters(self):
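
For reference, the merges list above is reverse-engineered from tiktoken's mergeable_ranks rather than read from a file: BPE is re-run over each token's own bytes while excluding ranks at or above the token's rank, so the two parts left standing are the pair that merged into it. A self-contained sketch of the technique (bpe_parts is an illustrative stand-in for QwenModel.bpe, which behaves similarly):

def bpe_parts(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
    # Start from single bytes and repeatedly merge the lowest-rank adjacent
    # pair, but only while that pair's rank is below max_rank (the token's own).
    parts = [bytes([b]) for b in token]
    while len(parts) > 1:
        best_idx, best_rank = None, None
        for i in range(len(parts) - 1):
            rank = mergeable_ranks.get(parts[i] + parts[i + 1])
            if rank is not None and rank < max_rank and (best_rank is None or rank < best_rank):
                best_idx, best_rank = i, rank
        if best_idx is None:
            break  # no pair may merge below max_rank: the remaining parts are final
        parts = parts[:best_idx] + [parts[best_idx] + parts[best_idx + 1]] + parts[best_idx + 2:]
    return parts

# Toy vocab: "abb" (rank 3) was formed by merging "ab" + "b", and that is recovered,
# which the converter then records as the merge string "ab b".
ranks = {b"a": 0, b"b": 1, b"ab": 2, b"abb": 3}
assert bpe_parts(ranks, b"abb", max_rank=3) == [b"ab", b"b"]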

@@ -6973,6 +7037,8 @@ def set_vocab(self):
         special_vocab.add_to_gguf(self.gguf_writer)
         # FIX for BOS token: Overwrite incorrect id read from config.json
         self.gguf_writer.add_bos_token_id(127959) # <|bos|>
+        # FIX - Hunyuan does not add a BOS
+        self.gguf_writer.add_bos_token(False)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
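
Both BOS fixes in this commit (Kimi-K2 above, Hunyuan here) record that the tokenizer does not prepend a BOS token, so the converted GGUF does not claim one. A quick sanity check against the upstream HF tokenizer (the expected output is an assumption, not a guaranteed result):

from transformers import AutoTokenizer

# If encode() prepended a BOS id, add_bos_token(False) would be wrong; the
# expectation for Kimi-K2 is that the first token is the text's own content.
tok = AutoTokenizer.from_pretrained("moonshotai/Kimi-K2-Instruct", trust_remote_code=True)
ids = tok.encode("hello")
print(tok.convert_ids_to_tokens(ids))  # expected: no leading <|bos|>-style token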

src/llama-vocab.cpp

Lines changed: 27 additions & 0 deletions
@@ -425,6 +425,29 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+                // Same as the GPT-4o tokenizer, except for the Han character rule [\\p{Han}]+
+                regex_exprs = {
+                    // 1. The high-priority Han character rule. Backslashes must be escaped.
+                    "[\\p{Han}]+",
+
+                    // 2 & 3. The word patterns adapted from GPT-4o/Tekken, emulating the uppercase/lowercase logic in a C++-compatible way.
+                    // The case-insensitive contraction is adapted to be C++ compatible as well.
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
+
+                    // 4. The number rule.
+                    "\\p{N}{1,3}",
+
+                    // 5. The Kimi K2 symbol rule, kept precise (no trailing '/').
+                    " ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
+
+                    // 6, 7, 8. The same whitespace rules.
+                    "\\s*[\\r\\n]+",
+                    "\\s+(?!\\S)",
+                    "\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
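
llama.cpp applies these expressions sequentially: each regex further splits the fragments produced by the previous one, which is why the Han rule is listed first and claims its text before the word rules run. A rough Python emulation of that pass-per-expression behaviour using the third-party regex module (unlike re, it supports \p{Han}); only a subset of the rules is shown and the match semantics are not bit-exact:

import regex

exprs = [
    r"[\p{Han}]+",                 # 1. Han runs claim their text first
    r"\p{N}{1,3}",                 # 4. digit runs of at most three
    r" ?[^\s\p{L}\p{N}]+[\r\n]*",  # 5. symbol runs
    r"\s+(?!\S)",                  # 7. trailing whitespace
]

def split_sequential(text: str, patterns: list[str]) -> list[str]:
    # Each pattern re-splits every fragment produced so far, mirroring the
    # one-pass-per-expression structure of llama.cpp's pre-tokenizer split.
    pieces = [text]
    for pat in patterns:
        nxt: list[str] = []
        for piece in pieces:
            last = 0
            for m in regex.finditer(pat, piece):
                if m.start() > last:
                    nxt.append(piece[last:m.start()])
                nxt.append(m.group())
                last = m.end()
            if last < len(piece):
                nxt.append(piece[last:])
        pieces = nxt
    return pieces

print(split_sequential("GGUF支持Kimi 12345!", exprs))
# ['GGUF', '支持', 'Kimi', ' ', '123', '45', '!']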
@@ -1665,6 +1688,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "hunyuan") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "kimi-k2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
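
The "kimi-k2" string matched here is the value the converter wrote via add_tokenizer_pre(tokpre); the two sides must stay in sync or loading fails with the unknown-pre-tokenizer error above. A sketch of inspecting that metadata with the gguf Python package (the file name is hypothetical and the field-access idiom may vary between gguf versions):

from gguf import GGUFReader

# Read back tokenizer.ggml.pre from a converted model; for a Kimi-K2
# conversion this should print "kimi-k2", matching the C++ branch above.
reader = GGUFReader("Kimi-K2-Instruct.gguf")  # hypothetical output file
field = reader.get_field("tokenizer.ggml.pre")
print(str(bytes(field.parts[-1]), encoding="utf-8"))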

src/llama-vocab.h

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ enum llama_vocab_pre_type {
     LLAMA_VOCAB_PRE_TYPE_PIXTRAL     = 34,
     LLAMA_VOCAB_PRE_TYPE_SEED_CODER  = 35,
     LLAMA_VOCAB_PRE_TYPE_HUNYUAN     = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2     = 37,
 };
 
 struct LLM_KV;
