1.18 habana_main merge #307

Closed
wants to merge 16 commits
34 changes: 34 additions & 0 deletions .github/workflows/cpu-test.yml
@@ -0,0 +1,34 @@
name: cpu-test

on:
# Trigger the workflow on push or pull request,
# but only for the habana_main branch
push:
branches:
- habana_main
pull_request:
branches:
- habana_main


jobs:
cputest:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.10"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
pip install -r requirements-hpu.txt
VLLM_TARGET_DEVICE=hpu python setup.py develop
- name: cpu-test
run: |
VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py
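
For reference, the same smoke test can be reproduced locally. A minimal sketch, assuming the HPU build of vLLM is already installed as in the workflow's install step:

```python
import os
import subprocess

# Mirror the environment the cpu-test workflow sets for the fake-HPU smoke test.
env = dict(os.environ,
           VLLM_SKIP_WARMUP="true",
           VLLM_PROMPT_SEQ_BUCKET_MAX="128",
           VLLM_USE_FAKE_HPU="1")
subprocess.run(["python", "examples/offline_inference_fakehpu.py"],
               env=env, check=True)
```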
1 change: 0 additions & 1 deletion .github/workflows/mypy.yaml
@@ -50,6 +50,5 @@ jobs:
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
-mypy vllm/hpu --config-file pyproject.toml


9 changes: 5 additions & 4 deletions README_GAUDI.md
@@ -81,14 +81,15 @@ Supported Features
- Inference with [HPU
Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
for accelerating low-batch latency and throughput
+- INC quantization

Unsupported Features
====================

- Beam search
- LoRA adapters
- Attention with Linear Biases (ALiBi)
-- Quantization (AWQ, FP8 E5M2, FP8 E4M3)
+- AWQ quantization
- Prefill chunking (mixed-batch inferencing)

Supported Configurations
@@ -315,9 +316,9 @@ mark 90% of free device memory at that point as usable. Next, KV cache
gets allocated, model is warmed up, and HPU Graphs are captured.
Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of
memory reserved for HPU Graphs capture. With its default value
-(`VLLM_GRAPH_RESERVED_MEM=0.4`), 40% of usable memory will be reserved
+(`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved
for graph capture (later referred to as \"usable graph memory\"), and
-the remaining 60% will be utilized for KV cache. Environment variable
+the remaining 90% will be utilized for KV cache. Environment variable
`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory
reserved for prefill and decode graphs. By default
(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory
@@ -445,7 +446,7 @@ Environment variables
- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by
default
- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for
-HPUGraph capture, `0.4` by default
+HPUGraph capture, `0.1` by default
- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory
dedicated for prompt graphs, `0.5` by default
- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt
7 changes: 4 additions & 3 deletions docs/source/getting_started/gaudi-installation.rst
@@ -76,14 +76,15 @@ Supported Features
- Tensor parallelism support for multi-card inference
- Inference with `HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html>`__
for accelerating low-batch latency and throughput
+- INC quantization

Unsupported Features
====================

- Beam search
- LoRA adapters
- Attention with Linear Biases (ALiBi)
-- Quantization (AWQ, FP8 E5M2, FP8 E4M3)
+- AWQ quantization
- Prefill chunking (mixed-batch inferencing)

Supported Configurations
@@ -243,7 +244,7 @@ Before KV cache gets allocated, model weights are loaded onto the device, and a
Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable.
Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture.
-With its default value (``VLLM_GRAPH_RESERVED_MEM=0.4``), 40% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 60% will be utilized for KV cache.
+With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints.
Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
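
A back-of-the-envelope sketch of the split described above, using the updated defaults (the free-memory figure below is an illustrative assumption, not a measured value):

```python
# Illustrative only: 96 GiB of free device memory is an assumed example value.
free_device_memory_gib = 96.0
gpu_memory_utilization = 0.9   # default: 90% of free memory marked usable
graph_reserved_mem = 0.1       # VLLM_GRAPH_RESERVED_MEM default after this change
graph_prompt_ratio = 0.5       # VLLM_GRAPH_PROMPT_RATIO default

usable = free_device_memory_gib * gpu_memory_utilization  # 86.4 GiB
graph_mem = usable * graph_reserved_mem                    # 8.64 GiB for HPU Graph capture
kv_cache_mem = usable - graph_mem                          # 77.76 GiB for KV cache
prompt_graph_mem = graph_mem * graph_prompt_ratio          # 4.32 GiB for prefill graphs
decode_graph_mem = graph_mem - prompt_graph_mem            # 4.32 GiB for decode graphs

print(f"usable={usable:.2f} GiB, graphs={graph_mem:.2f} GiB "
      f"(prefill {prompt_graph_mem:.2f} / decode {decode_graph_mem:.2f}), "
      f"kv_cache={kv_cache_mem:.2f} GiB")
```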

@@ -322,7 +323,7 @@ Environment variables
**Performance tuning knobs:**

- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
-- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.4`` by default
+- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default
- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
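
As a usage sketch for the knobs above (values are illustrative, not tuned recommendations; assumes a Gaudi device with the HPU build of vLLM installed), the variables can be exported in the shell or set in-process before the engine is created:

```python
import os

# Illustrative values only; set before the LLM engine is constructed.
os.environ["VLLM_GRAPH_RESERVED_MEM"] = "0.1"   # share of usable memory kept for HPU Graphs
os.environ["VLLM_GRAPH_PROMPT_RATIO"] = "0.3"   # lower value biases the graph budget toward decode graphs
os.environ["VLLM_GRAPH_PROMPT_STRATEGY"] = "min_tokens"
os.environ["VLLM_GRAPH_DECODE_STRATEGY"] = "max_bs"

from vllm import LLM  # noqa: E402

llm = LLM(model="facebook/opt-125m")
```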
38 changes: 38 additions & 0 deletions examples/offline_inference_fakehpu.py
@@ -0,0 +1,38 @@
import os

from vllm import LLM, SamplingParams

if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0':
from vllm.utils import migrate_to_cpu
migrate_to_cpu()

# Sample prompts.
prompts = [
"Berlin is the capital city of ",
"Louvre is located in the city of ",
"Barack Obama was the 44th president of ",
"Warsaw is the capital city of ",
"Gniezno is a city in ",
"San Francisco is located in the state of ",
"Llanfairpwllgwyngyll is located in country of ",
]
ref_answers = [
"Germany", "Paris", "United States", "Poland", "Poland", "California",
"Wales"
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False)

# Create an LLM.
llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output, answer in zip(outputs, ref_answers):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
assert answer in generated_text, (
f"The generated text does not contain the correct answer: {answer}")
print('PASSED')
1 change: 0 additions & 1 deletion format.sh
@@ -113,7 +113,6 @@ mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
-mypy vllm/hpu --config-file pyproject.toml


# If git diff returns a file that is in the skip list, the file may be checked anyway:
1 change: 1 addition & 0 deletions requirements-hpu.txt
@@ -6,3 +6,4 @@ ray == 2.32.0
triton
pandas
tabulate
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1
2 changes: 1 addition & 1 deletion tests/lora/test_lora_hpu.py
@@ -1,7 +1,7 @@
import pytest
import torch
+from vllm_hpu_extension.ops import LoraMask

-from vllm.hpu.ops import LoraMask
from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice

from .utils import DummyLoRAManager
4 changes: 4 additions & 0 deletions vllm/__init__.py
@@ -1,4 +1,8 @@
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
+from vllm.utils import is_fake_hpu, migrate_to_cpu
+
+if is_fake_hpu():
+    migrate_to_cpu()

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
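
A hedged usage note on the import-time gate above (not part of the diff): because `migrate_to_cpu()` runs while `vllm/__init__.py` is being imported, `VLLM_USE_FAKE_HPU` has to be set before the first `import vllm`, for example:

```python
import os

# Must happen before the first vllm import; the is_fake_hpu() check in
# vllm/__init__.py runs at import time and calls migrate_to_cpu().
os.environ["VLLM_USE_FAKE_HPU"] = "1"

from vllm import LLM, SamplingParams  # noqa: E402
```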
6 changes: 3 additions & 3 deletions vllm/attention/backends/habana_attn.py
@@ -7,14 +7,14 @@
from typing import Any, Dict, List, Optional, Tuple, Type

import torch
+import vllm_hpu_extension.ops as ops
+from vllm_hpu_extension import cache_ops
+from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache

-import vllm.hpu.ops as ops
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionMetadata, AttentionType)
from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention,
HabanaPagedAttentionMetadata)
-from vllm.hpu import cache_ops
-from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
from vllm.logger import init_logger

logger = init_logger(__name__)
3 changes: 1 addition & 2 deletions vllm/attention/ops/habana_paged_attn.py
@@ -6,8 +6,7 @@
from typing import Dict, List, Optional, Tuple

import torch

-from vllm.hpu import cache_ops, ops
+from vllm_hpu_extension import cache_ops, ops

# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512
12 changes: 7 additions & 5 deletions vllm/executor/ray_habana_executor.py
@@ -13,7 +13,7 @@
from vllm.utils import (_run_task_with_lock,
error_on_invalid_device_count_status,
get_distributed_init_method, get_ip, get_open_port,
-get_vllm_instance_id, make_async)
+get_vllm_instance_id, is_fake_hpu, make_async)

if ray is not None:
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -87,18 +87,20 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
driver_ip = get_ip()
worker_wrapper_kwargs = self._get_worker_wrapper_args()
for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-if not bundle.get("HPU", 0):
+resource_name = "HPU" if not is_fake_hpu() else "CPU"
+if not bundle.get(resource_name, 0):
continue
scheduling_strategy = PlacementGroupSchedulingStrategy(
placement_group=placement_group,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=bundle_id,
)

+resources = {'HPU': num_gpus} if not is_fake_hpu() else {}
+num_cpus = 0 if not is_fake_hpu() else num_gpus
worker = ray.remote(
-num_cpus=0,
+num_cpus=num_cpus,
num_gpus=0,
-resources={'HPU': num_gpus},
+resources=resources,
scheduling_strategy=scheduling_strategy,
**ray_remote_kwargs,
)(RayWorkerWrapper).remote(**worker_wrapper_kwargs)
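
To illustrate the scheduling pattern above in isolation, here is a small, hypothetical Ray snippet (not from the PR): on real Gaudi nodes workers are placed against the custom `HPU` resource, while fake-HPU mode falls back to plain CPU scheduling so the same path runs on CPU-only machines:

```python
import ray

fake_hpu = True  # stand-in for is_fake_hpu(); set False on a real Gaudi cluster

# On a real cluster the "HPU" resource is advertised by the nodes; it is
# declared at init here only so the example can run anywhere.
ray.init(resources={} if fake_hpu else {"HPU": 1})

resources = {} if fake_hpu else {"HPU": 1}  # mirrors the selection in _init_workers_ray
num_cpus = 1 if fake_hpu else 0


@ray.remote(num_cpus=num_cpus, num_gpus=0, resources=resources)
def report() -> str:
    return "CPU task (fake HPU)" if fake_hpu else "HPU-scheduled task"


print(ray.get(report.remote()))
```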
5 changes: 3 additions & 2 deletions vllm/executor/ray_utils.py
@@ -3,7 +3,8 @@
from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest
-from vllm.utils import get_ip, is_hip, is_hpu, is_tpu, is_xpu
+from vllm.utils import (get_ip, hpu_device_string, is_hip, is_hpu, is_tpu,
+is_xpu)
from vllm.worker.worker_base import WorkerWrapperBase

logger = init_logger(__name__)
@@ -97,7 +98,7 @@ def initialize_ray_cluster(
if is_tpu():
device_str = "TPU"
elif is_hpu():
-device_str = "HPU"
+device_str = hpu_device_string().upper()
# Create placement group for worker processes
current_placement_group = ray.util.get_current_placement_group()
if current_placement_group:
6 changes: 0 additions & 6 deletions vllm/hpu/__init__.py

This file was deleted.

107 changes: 0 additions & 107 deletions vllm/hpu/cache_ops.py

This file was deleted.
