1 file changed: +24 −0 lines changed

@@ -563,6 +563,30 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
         self.gguf_writer.add_file_type(self.ftype)
 
+    def set_vocab(self):
+        tokens = []
+        scores = []
+        toktypes = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+        for i in range(vocab_size):
+            tokens.append(reverse_vocab[i])
+            scores.append(0.0)  # dummy
+            toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def write_tensors(self):
         block_count = self.hparams.get("num_hidden_layers")
         if block_count is None:
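
For context, the added set_vocab() follows the usual Hugging Face vocab-extraction idiom: it inverts the tokenizer's token→id vocab into an id→token map, then emits one entry per id up to vocab_size, with dummy scores and NORMAL token types. The idiom can be tried standalone; a minimal sketch, assuming transformers is installed and that "./my-model" (a hypothetical path) contains a Hugging Face tokenizer:

```python
# Minimal sketch of the vocab extraction done in set_vocab() above.
# "./my-model" is a hypothetical local model directory with a tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./my-model")

# tokenizer.vocab maps encoded token -> id; invert it to id -> token.
reverse_vocab = {tok_id: tok for tok, tok_id in tokenizer.vocab.items()}

vocab_size = len(tokenizer.vocab)
assert max(tokenizer.vocab.values()) < vocab_size  # ids must fit in [0, vocab_size)

# One entry per id, in id order, as the GGUF token list expects.
tokens = [reverse_vocab[i] for i in range(vocab_size)]
scores = [0.0] * vocab_size  # dummy scores, as in the hunk: no merge scores here

print(tokens[:10], scores[:3])
```

Note that a gap in the id range would surface as a KeyError in the lookup loop, both here and in the hunk above, since the assert only bounds the maximum id, not the density of the mapping.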