40 changes: 40 additions & 0 deletions tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -43,6 +43,29 @@
],
)

CASE_QWEN_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I want to know if there are any",
' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
' not just a technological frontier but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
],
)

CASE_DS_FULL = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
'\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
],
)

CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
@@ -93,6 +116,23 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)

@pytest.mark.parametrize(
"cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
runner_kwargs = {
"model_name": cur_case.model,
"max_model_len": 1024,
"compilation_config": {
"cudagraph_capture_sizes": [4, 8, 32, 64],
"cudagraph_mode": "FULL_DECODE_ONLY"
},
"quantization": cur_case.quantization,
}
gen_and_valid(runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers)

@pytest.mark.parametrize(
"cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
61 changes: 56 additions & 5 deletions vllm_ascend/worker/model_runner_v1.py
@@ -206,6 +206,13 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
vllm_config.scheduler_config.max_num_batched_tokens += max_pcp_pad_tokens
with _torch_cuda_wrapper():
super().__init__(vllm_config, device)

# NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
# See _pad_query_start_loc_for_fia.
self.query_start_loc = self._make_buffer(
self.max_num_reqs + 2, dtype=torch.int32 # type: ignore[has-type]
)
Comment on lines +210 to +214
Collaborator Author: Check if other buffers should be extended too.

Collaborator Author: This is strange; no error so far.


vllm_config.scheduler_config.max_num_batched_tokens -= max_pcp_pad_tokens
self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
self.max_num_reqs = self.scheduler_config.max_num_seqs
@@ -509,6 +516,36 @@ def get_model(self) -> nn.Module:
return self.model.unwrap()
return self.model

def _pad_query_start_loc_for_fia(
self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int
) -> int:
"""
This function only exists to satisfy the constraint that, when the layout is TND,
the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
"""

if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
# Uniform-batch case: num_reqs must be no greater than num_reqs_padded
assert num_reqs <= num_reqs_padded

last_loc = self.query_start_loc.np[num_reqs]
self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1] = (
self.arange_np[1 : num_reqs_padded + 1 - num_reqs]
* self.uniform_decode_query_len
+ last_loc
)
Comment on lines +527 to +536
Collaborator Author: [0, 1, 2] -> [0, 1, 2, 3, 4]
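A minimal standalone numpy sketch of this uniform-batch padding (illustrative values only; uniform_decode_query_len = 1, num_reqs = 2, num_reqs_padded = 4 are assumed, not taken from the diff):

import numpy as np

uniform_decode_query_len = 1
num_reqs, num_reqs_padded = 2, 4
query_start_loc = np.array([0, 1, 2, 0, 0], dtype=np.int32)  # first num_reqs + 1 entries are real
arange = np.arange(num_reqs_padded + 1)
last_loc = query_start_loc[num_reqs]
# Extend the prefix sums with dummy single-token decode requests.
query_start_loc[num_reqs + 1:num_reqs_padded + 1] = (
    arange[1:num_reqs_padded + 1 - num_reqs] * uniform_decode_query_len + last_loc)
print(query_start_loc)  # [0 1 2 3 4]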

else:
# Mixed-batch case: num_reqs must equal num_reqs_padded
assert num_reqs == num_reqs_padded

# Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
num_reqs_padded = num_reqs_padded + 1

self.query_start_loc.copy_to_gpu()
Comment on lines +537 to +545
Collaborator Author: [0, 3] -> [0, 3, 4]
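A matching standalone sketch of the mixed-batch branch (illustrative values only; num_reqs = num_reqs_padded = 1 and num_tokens_padded = 4 are assumed):

import numpy as np

num_reqs_padded, num_tokens_padded = 1, 4
query_start_loc = np.array([0, 3, 0], dtype=np.int32)  # one real request covering 3 tokens
# Append a dummy request that owns the padded tokens.
query_start_loc[num_reqs_padded + 1] = num_tokens_padded
num_reqs_padded += 1
print(query_start_loc)  # [0 3 4]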


return num_reqs_padded

def _prepare_inputs(
self,
scheduler_output: "SchedulerOutput",
@@ -666,10 +703,6 @@ def _prepare_inputs(

self.query_start_loc.np[0] = 0
self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
# NOTE: Due to the FIA operator limitation, here we pad so that hidden_states.shape[0]
# and self.query_start_loc[num_reqs_padded] are equal
self.query_start_loc.np[num_reqs + 1:] = (self.arange_np[1:self.max_num_reqs + 1 - num_reqs]
* self.uniform_decode_query_len + cu_num_tokens[-1])
self.query_start_loc.copy_to_gpu()

self.seq_lens.np[:num_reqs] = (
@@ -1153,6 +1186,7 @@ def execute_model(
scheduler_output,
num_scheduled_tokens_np,
)

num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
if self.pcp_size > 1:
num_tokens_unpadded = self.pcp_manager.total_num_sampled_tokens_pcp
@@ -1207,6 +1241,11 @@ def execute_model(
use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices

if cudagraph_mode != CUDAGraphMode.NONE:
num_reqs_padded = self._pad_query_start_loc_for_fia(
num_tokens_padded, num_reqs_padded, num_reqs
)

Comment on lines +1244 to +1248
Collaborator Author: Maybe current_platform.post_process_after_padding.

(attn_metadata, spec_decode_common_attn_metadata) = (
self._build_attention_metadata(
num_tokens=num_tokens_unpadded,
@@ -1341,7 +1380,6 @@ def execute_model(
assert broadcasted is not None
logits = broadcasted["logits"]


# Apply structured output bitmasks if present
self.execute_model_state = ExecuteModelState(
scheduler_output,
@@ -1941,6 +1979,13 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int):
long_seq_metdadata = _get_pcp_metadata(num_tokens)
block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)

actual_last_loc = self.query_start_loc.np[num_reqs_padded]
error_msg = (
f"Due to FIA kernel constraints, when the layout is TND, "
f"the first dimension of `hidden_states` ({num_tokens_padded}) "
f"must equal the last element of `actual_seq_lengths_q` ({actual_last_loc})."
)
assert actual_last_loc == num_tokens_padded, error_msg
cm_base = AscendCommonAttentionMetadata(
query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
@@ -2193,9 +2238,15 @@ def _dummy_run(
self.seq_lens.np[:num_reqs_padded] = seq_lens
self.seq_lens.np[num_reqs_padded:] = 0
self.seq_lens.copy_to_gpu()

cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
self.query_start_loc.copy_to_gpu()

num_reqs_padded = self._pad_query_start_loc_for_fia(
num_tokens_padded, num_reqs_padded, num_reqs
)

pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
attn_metadata, _ = self._build_attention_metadata(
num_tokens=num_tokens_unpadded,