Skip to content

Commit 7c7b199

Browse files
authored
Added draft token count as parameter to chat.py (#635)
1 parent 15e5404 commit 7c7b199

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

examples/chat.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
33 33
parser = argparse.ArgumentParser(description = "Simple Llama2 chat example for ExLlamaV2")
34 34
parser.add_argument("-dm", "--draft_model_dir", type = str, default = None, help = "Path to draft model directory")
35 35
parser.add_argument("-nds", "--no_draft_scale", action = "store_true", help = "If draft model has smaller context size than model, don't apply alpha (NTK) scaling to extend it")
36+
parser.add_argument("-dn", "--draft_n_tokens", type = int, default = 5, help = "How many tokens to speculate ahead (defaults to 5)")
36 37

37 38
parser.add_argument("-modes", "--modes", action = "store_true", help = "List available modes and exit.")
38 39
parser.add_argument("-mode", "--mode", choices = prompt_formats_list, help = "Chat mode. Use llama for Llama 1/2 chat finetunes.")
@@ -219,7 +220,7 @@ def get_tokenized_context(max_len):
219 220

220 221
# Generator
221 222

222-
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer, draft_model, draft_cache)
223+
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer, draft_model, draft_cache, num_speculative_tokens=args.draft_n_tokens)
223 224
generator.speculative_ngram = args.ngram_decoding
224 225

225 226
settings = ExLlamaV2Sampler.Settings(

0 commit comments

Comments (0)