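Fix use_exllama logic error: require bits == 4, HAS_EXLLAMA, and IS_TP_AWARE_GPTQ with preshuffle or masked matmul (the old `and ... or` precedence enabled exllama whenever preshuffle or masked matmul was set)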

cyang49 · cyang49 · commit 976d36d7881d · 2024-03-22T17:44:16.000Z
diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py
@@ -219,12 +219,13 @@ def get_multi_weights_row(self, prefix: str, quantize: str, row_perm=None, nosha
         if quantize == "gptq":
             bits, groupsize = self._get_gptq_params()
 
-            from text_generation_server.utils.layers import HAS_EXLLAMA
+            from text_generation_server.utils.layers import HAS_EXLLAMA, IS_TP_AWARE_GPTQ
             is_preshuffle = (row_perm != None)
             is_masked_matmul = noshard
             assert (is_preshuffle != is_masked_matmul
                     or not (is_preshuffle or is_masked_matmul)), f"TP-aware optimization can't both be enabled at the same time {is_preshuffle=}, {is_masked_matmul=}"
-            use_exllama = (bits == 4) and HAS_EXLLAMA or (is_preshuffle or is_masked_matmul)
+
+            use_exllama = (bits == 4) and HAS_EXLLAMA and (IS_TP_AWARE_GPTQ and (is_preshuffle or is_masked_matmul))
             if self.process_group.rank == 0:
                 if use_exllama:
                     logger.info(f"Using exllama kernels for row {prefix}")