From 7a16eb915ca34cac44ad54fc44e6107507f1de94 Mon Sep 17 00:00:00 2001
From: Linoy Buchnik
Date: Mon, 24 Feb 2025 15:04:11 +0200
Subject: [PATCH] Recalc scales from user (#774)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Multiply the input scale by a factor of 448/240.

---------

Co-authored-by: Michał Kuligowski
---
 requirements-hpu.txt                                        | 2 +-
 .../schemes/compressed_tensors_w8a8_fp8.py                  | 7 +++++--
 .../model_executor/layers/quantization/utils/w8a8_utils.py  | 7 +++----
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 8ca3b8caf05aa..9096c8deecf5f 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@8087a98
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb47de4
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 182fdb111e6a8..5d9e8169c82ae 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -5,6 +5,7 @@
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
+from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
@@ -84,8 +85,10 @@ def process_weights_after_loading(self, layer) -> None:
 
         # INPUT SCALE
         if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
-            layer.input_scale = Parameter(layer.input_scale.max(),
-                                          requires_grad=False)
+            input_scale = layer.input_scale.max()
+            if is_hpu_gaudi2():
+                input_scale = input_scale * get_hpu_gaudi2_scale_factor()
+            layer.input_scale = Parameter(input_scale, requires_grad=False)
         else:
             layer.input_scale = None
 
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index d79c4f69ee5fb..a7033198bdcb3 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -3,6 +3,7 @@
 from typing import List, Optional, Tuple, Union
 
 import torch
+from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
@@ -101,10 +102,8 @@ def requantize_with_max_scale(
         logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
     # Max scale to be used for requanitzation.
     max_w_scale = weight_scale.max()
-    if current_platform.is_hpu() and htexp._get_device_type(
-    ) == htexp.synDeviceType.synDeviceGaudi2:
-        max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max /
-                                     torch.finfo(torch.float8_e4m3fnuz).max)
+    if is_hpu_gaudi2():
+        max_w_scale = max_w_scale * get_hpu_gaudi2_scale_factor()
     # QKV / MLP is fused in the on disk checkpoint if any of the
     # weight scales are still set to the default since we initialize
     # N weight scales for N shards but we only load 1 weight scale
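For reference, a minimal sketch of the Gaudi2 rescaling this patch applies. It assumes
get_hpu_gaudi2_scale_factor() returns the same ratio as the inline expression removed from
requantize_with_max_scale, i.e. torch.finfo(torch.float8_e4m3fn).max /
torch.finfo(torch.float8_e4m3fnuz).max = 448 / 240; the standalone gaudi2_fp8_scale_factor
helper and the literal scale values below are illustrative only, not part of vllm-hpu-extension.

    import torch

    def gaudi2_fp8_scale_factor() -> float:
        # Illustrative stand-in (assumption) for
        # vllm_hpu_extension.ops.get_hpu_gaudi2_scale_factor(): Gaudi2 fp8 covers the
        # e4m3fnuz range (max 240) while checkpoint scales are calibrated for e4m3fn
        # (max 448), so scales are stretched by 448 / 240.
        return (torch.finfo(torch.float8_e4m3fn).max /
                torch.finfo(torch.float8_e4m3fnuz).max)

    # Usage mirroring the patched code paths: rescale the max weight scale and the
    # static input scale on Gaudi2.
    max_w_scale = torch.tensor(0.02)
    input_scale = torch.tensor(0.05)
    max_w_scale = max_w_scale * gaudi2_fp8_scale_factor()  # 0.02 * 448/240 ~= 0.0373
    input_scale = input_scale * gaudi2_fp8_scale_factor()  # 0.05 * 448/240 ~= 0.0933

With this change, the stretch that requantize_with_max_scale already applied to the max weight
scale is also applied to the static input scale in the compressed-tensors w8a8 fp8 scheme, so
both scales stay consistent with Gaudi2's fp8 range.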