Commit f6abbba

Add Q8 cache option to example chatbot
1 parent 6030517 commit f6abbba

File tree

1 file changed: +6 -0 lines changed

1 file changed

+6
-0
lines changed

examples/chat.py

Lines changed: 6 additions & 0 deletions
@@ -8,6 +8,7 @@
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Tokenizer,
     model_init,
 )
@@ -54,6 +55,7 @@
 
 parser.add_argument("-c8", "--cache_8bit", action = "store_true", help = "Use 8-bit (FP8) cache")
 parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
+parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
 
 parser.add_argument("-ngram", "--ngram_decoding", action = "store_true", help = "Use n-gram speculative decoding")
 
@@ -128,6 +130,8 @@
         draft_cache = ExLlamaV2Cache_8bit(draft_model)
     elif args.cache_q4:
         draft_cache = ExLlamaV2Cache_Q4(draft_model)
+    elif args.cache_q8:
+        draft_cache = ExLlamaV2Cache_Q8(draft_model)
     else:
         draft_cache = ExLlamaV2Cache(draft_model)
 
@@ -137,6 +141,8 @@
     cache = ExLlamaV2Cache_8bit(model, lazy = not model.loaded)
 elif args.cache_q4:
     cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+elif args.cache_q8:
+    cache = ExLlamaV2Cache_Q8(model, lazy = not model.loaded)
 else:
     cache = ExLlamaV2Cache(model, lazy = not model.loaded)
 

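With this change, examples/chat.py accepts -cq8 / --cache_q8 alongside the existing -c8 and -cq4 flags, routing the cache selection to ExLlamaV2Cache_Q8 for both the main model and, when speculative decoding is enabled, the draft model. For reference, a minimal standalone sketch of the same cache class outside the chatbot (not part of this commit; the model directory is a placeholder, and the load sequence follows the pattern used in the library's other examples):

# Minimal sketch, assuming a local model directory; the load sequence
# mirrors the other exllamav2 examples rather than this commit's code.
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache_Q8,
    ExLlamaV2Tokenizer,
)

config = ExLlamaV2Config()
config.model_dir = "/path/to/model"  # placeholder path
config.prepare()

model = ExLlamaV2(config)

# ExLlamaV2Cache_Q8 takes the same constructor arguments as the FP16,
# FP8 and Q4 cache classes; lazy = True defers allocation so the cache
# can be created before the model weights are loaded.
cache = ExLlamaV2Cache_Q8(model, lazy = True)
model.load_autosplit(cache)

tokenizer = ExLlamaV2Tokenizer(config)

Because the constructor signature matches the other cache variants, Q8 drops in as a middle ground between the FP16 default and the Q4 cache in the chatbot's existing selection chain.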