Commit

Recalc scales from user (#774)
Multiply the input scale by a factor of 448/240 (fp8 e4m3fn max / fp8 e4m3fnuz max)

---------

Co-authored-by: Michał Kuligowski <[email protected]>
linoybu and michalkuligowski authored Feb 24, 2025
1 parent f6441f3 commit 7a16eb9
Showing 3 changed files with 9 additions and 7 deletions.
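
For reference, the 448/240 factor named in the commit message is the ratio of the representable maxima of the two fp8 formats involved; a quick check with torch (illustrative snippet, not part of the patch):

import torch

fn_max = torch.finfo(torch.float8_e4m3fn).max      # 448.0, OCP fp8 e4m3
fnuz_max = torch.finfo(torch.float8_e4m3fnuz).max  # 240.0, the range the Gaudi2 path scales against
print(fn_max / fnuz_max)                           # 1.8666...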
2 changes: 1 addition & 1 deletion requirements-hpu.txt
@@ -8,4 +8,4 @@ pandas
 tabulate
 setuptools>=61
 setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@8087a98
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb47de4
@@ -5,6 +5,7 @@
 import torch
 from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
+from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
@@ -84,8 +85,10 @@ def process_weights_after_loading(self, layer) -> None:
 
         # INPUT SCALE
         if self.is_static_input_scheme and hasattr(layer, 'input_scale'):
-            layer.input_scale = Parameter(layer.input_scale.max(),
-                                          requires_grad=False)
+            input_scale = layer.input_scale.max()
+            if is_hpu_gaudi2():
+                input_scale = input_scale * get_hpu_gaudi2_scale_factor()
+            layer.input_scale = Parameter(input_scale, requires_grad=False)
         else:
             layer.input_scale = None
 
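is_hpu_gaudi2 and get_hpu_gaudi2_scale_factor are imported from vllm_hpu_extension.ops; their bodies are not part of this diff. A minimal sketch of what they are assumed to provide, modelled on the inline expression they replace in w8a8_utils.py below (the real implementations in the extension may differ):

import torch

def is_hpu_gaudi2() -> bool:
    # Assumption: True only when the active HPU device is a Gaudi2 card;
    # the real check lives in vllm_hpu_extension.ops.
    ...

def get_hpu_gaudi2_scale_factor() -> float:
    # Assumed to mirror the inline expression removed from w8a8_utils.py:
    # fp8 e4m3fn max (448.0) over fp8 e4m3fnuz max (240.0), i.e. ~1.867.
    return (torch.finfo(torch.float8_e4m3fn).max /
            torch.finfo(torch.float8_e4m3fnuz).max)
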
7 changes: 3 additions & 4 deletions vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -3,6 +3,7 @@
 from typing import List, Optional, Tuple, Union
 
 import torch
+from vllm_hpu_extension.ops import get_hpu_gaudi2_scale_factor, is_hpu_gaudi2
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
@@ -101,10 +102,8 @@ def requantize_with_max_scale(
         logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
     # Max scale to be used for requantization.
     max_w_scale = weight_scale.max()
-    if current_platform.is_hpu() and htexp._get_device_type(
-    ) == htexp.synDeviceType.synDeviceGaudi2:
-        max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max /
-                                     torch.finfo(torch.float8_e4m3fnuz).max)
+    if is_hpu_gaudi2():
+        max_w_scale = max_w_scale * get_hpu_gaudi2_scale_factor()
     # QKV / MLP is fused in the on disk checkpoint if any of the
     # weight scales are still set to the default since we initialize
     # N weight scales for N shards but we only load 1 weight scale
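
With both call sites delegating to get_hpu_gaudi2_scale_factor(), the input-scale path in the compressed-tensors scheme and the weight-scale path here pick up the same Gaudi2 correction rather than duplicating the 448/240 expression; a checkpoint scale of, say, 0.0125 becomes 0.0125 * 448/240 ≈ 0.0233 on Gaudi2. The pinned vllm-hpu-extension revision in requirements-hpu.txt presumably moves to bb47de4 to pick up these helpers.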
