Skip to content

Commit 976d36d

Browse files
committed
fix use_exllama logic error
1 parent 539375e commit 976d36d

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

server/text_generation_server/utils/weights.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -219,12 +219,13 @@ def get_multi_weights_row(self, prefix: str, quantize: str, row_perm=None, nosha
219219
if quantize == "gptq":
220220
bits, groupsize = self._get_gptq_params()
221221

222-
from text_generation_server.utils.layers import HAS_EXLLAMA
222+
from text_generation_server.utils.layers import HAS_EXLLAMA, IS_TP_AWARE_GPTQ
223223
is_preshuffle = (row_perm != None)
224224
is_masked_matmul = noshard
225225
assert (is_preshuffle != is_masked_matmul
226226
or not (is_preshuffle or is_masked_matmul)), f"TP-aware optimization can't both be enabled at the same time {is_preshuffle=}, {is_masked_matmul=}"
227-
use_exllama = (bits == 4) and HAS_EXLLAMA or (is_preshuffle or is_masked_matmul)
227+
228+
use_exllama = (bits == 4) and HAS_EXLLAMA and (IS_TP_AWARE_GPTQ and (is_preshuffle or is_masked_matmul))
228229
if self.process_group.rank == 0:
229230
if use_exllama:
230231
logger.info(f"Using exllama kernels for row {prefix}")

0 commit comments

Comments
 (0)