Commit 63dd07a

Temporary revert to old vocab conversion for falcon
1 parent 0bb242a commit 63dd07a

1 file changed: +24 −0 lines changed

model.py (+24)
@@ -563,6 +563,30 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)
 
+    def set_vocab(self):
+        tokens = []
+        scores = []
+        toktypes = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+        for i in range(vocab_size):
+            tokens.append(reverse_vocab[i])
+            scores.append(0.0)  # dummy
+            toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges = True, n_vocab = len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def write_tensors(self):
         block_count = self.hparams.get("num_hidden_layers")
         if block_count is None:
0 commit comments
