Commit f6abbba

Add Q8 cache option to example chatbot
1 parent 6030517 commit f6abbba

File tree

1 file changed: +6 -0 lines changed

1 file changed

+6
-0
lines changed

examples/chat.py

Lines changed: 6 additions & 0 deletions
@@ -8,6 +8,7 @@
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Tokenizer,
     model_init,
 )
@@ -54,6 +55,7 @@
 
 parser.add_argument("-c8", "--cache_8bit", action = "store_true", help = "Use 8-bit (FP8) cache")
 parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
+parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
 
 parser.add_argument("-ngram", "--ngram_decoding", action = "store_true", help = "Use n-gram speculative decoding")
 
@@ -128,6 +130,8 @@
         draft_cache = ExLlamaV2Cache_8bit(draft_model)
     elif args.cache_q4:
         draft_cache = ExLlamaV2Cache_Q4(draft_model)
+    elif args.cache_q8:
+        draft_cache = ExLlamaV2Cache_Q8(draft_model)
     else:
         draft_cache = ExLlamaV2Cache(draft_model)
 
@@ -137,6 +141,8 @@
     cache = ExLlamaV2Cache_8bit(model, lazy = not model.loaded)
 elif args.cache_q4:
     cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+elif args.cache_q8:
+    cache = ExLlamaV2Cache_Q8(model, lazy = not model.loaded)
 else:
     cache = ExLlamaV2Cache(model, lazy = not model.loaded)
 

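With this change, examples/chat.py accepts -cq8 / --cache_q8 alongside the existing -c8 and -cq4 flags, routing the cache selection to ExLlamaV2Cache_Q8 for both the main model and, when speculative decoding is enabled, the draft model. For reference, a minimal standalone sketch of the same cache class outside the chatbot (not part of this commit; the model directory is a placeholder, and the load sequence follows the pattern used in the library's other examples):

# Minimal sketch, assuming a local model directory; the load sequence
# mirrors the other exllamav2 examples rather than this commit's code.
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache_Q8,
    ExLlamaV2Tokenizer,
)

config = ExLlamaV2Config()
config.model_dir = "/path/to/model"  # placeholder path
config.prepare()

model = ExLlamaV2(config)

# ExLlamaV2Cache_Q8 takes the same constructor arguments as the FP16,
# FP8 and Q4 cache classes; lazy = True defers allocation so the cache
# can be created before the model weights are loaded.
cache = ExLlamaV2Cache_Q8(model, lazy = True)
model.load_autosplit(cache)

tokenizer = ExLlamaV2Tokenizer(config)

Because the constructor signature matches the other cache variants, Q8 drops in as a middle ground between the FP16 default and the Q4 cache in the chatbot's existing selection chain.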