Commit 4c1e1ff

Merge branch 'vllm-project:main' into main
2 parents ea73842 + 5b0a6bc commit 4c1e1ff

3 files changed: +45 -97 lines changed

.gemini/styleguide.md

Lines changed: 40 additions & 1 deletion
@@ -1,5 +1,14 @@
 # Pull Request Summary Style Guide
 
+## Output Instructions
+
+**IMPORTANT**: When doing PR review, you MUST output them in markdown code blocks so users can easily copy them:
+
+1. **PR Title**: Output the generated title in a code block with triple backticks
+2. **PR Summary**: Output the generated summary in a markdown code block with triple backticks
+
+This allows users to directly copy the content without manual formatting.
+
 ## Pull Request Summary Format
 
 The summary should follow the format:
@@ -37,7 +46,7 @@ The summary should also refresh the Pull Request Title to follow the format:
 [Branch][Module][Action] Pull Request Title
 ```
 
-- Branch: The branch name where the PR is based.
+- Branch: The branch name where the PR is based. If the base branch is main, this prefix can be omitted.
 - Module: The module or component being changed. It includes but is not limited to the following:
   - [Attention]
   - [Ops]
@@ -49,3 +58,33 @@ The summary should also refresh the Pull Request Title to follow the format:
 - [BugFix]
 - [Feature]
 - [Misc]
+
+## Example Output Format
+
+When providing a PR review, format your response like this:
+
+**Suggested PR Title:**
+
+```markdown
+[Branch][Module][Action] Your generated title here
+```
+
+**Suggested PR Summary:**
+
+```markdown
+### What this PR does / why we need it?
+
+Your analysis of what the PR does and why it's needed.
+
+Fixes #issue_number
+
+### Does this PR introduce _any_ user-facing change?
+
+Your assessment of user-facing changes.
+
+### How was this patch tested?
+
+Your description of testing approach.
+```
+
+And please print your review suggestion in markdown format.

tests/e2e/singlecard/test_aclgraph_accuracy.py

Lines changed: 0 additions & 40 deletions
@@ -44,29 +44,6 @@
     ],
 )
 
-CASE_QWEN_FULL = LLMTestCase(
-    model="Qwen/Qwen3-0.6B",
-    prompts=PROMPTS_SHORT,
-    golden_answers=[
-        " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I want to know if there are any",
-        ' the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president',
-        ' Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of',
-        ' not just a technological frontier but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and'
-    ],
-)
-
-CASE_DS_FULL = LLMTestCase(
-    model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
-    quantization="ascend",
-    prompts=PROMPTS_SHORT,
-    golden_answers=[
-        '\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2',
-        ' a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the',
-        ' Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art',
-        ' here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of'
-    ],
-)
-
 CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
     model="Qwen/Qwen3-0.6B",
     prompts=PROMPTS_LONG,
@@ -117,23 +94,6 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
                   sampling_params=cur_case.sampling_params,
                   golden_answers=cur_case.golden_answers)
 
-@pytest.mark.parametrize(
-    "cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
-def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
-    monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
-    runner_kwargs = {
-        "model_name": cur_case.model,
-        "max_model_len": 1024,
-        "compilation_config": {
-            "cudagraph_capture_sizes": [4, 8, 32, 64],
-            "cudagraph_mode": "FULL_DECODE_ONLY"
-        },
-        "quantization": cur_case.quantization,
-    }
-    gen_and_valid(runner_kwargs=runner_kwargs,
-                  prompts=cur_case.prompts,
-                  sampling_params=cur_case.sampling_params,
-                  golden_answers=cur_case.golden_answers)
 
 @pytest.mark.parametrize(
     "cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])

vllm_ascend/worker/model_runner_v1.py

Lines changed: 5 additions & 56 deletions
@@ -206,13 +206,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         vllm_config.scheduler_config.max_num_batched_tokens += max_pcp_pad_tokens
         with _torch_cuda_wrapper():
             super().__init__(vllm_config, device)
-
-        # NOTE: For FULL mode we change +1 to +2 to reserve extra space for padding.
-        # See _pad_query_start_loc_for_fia.
-        self.query_start_loc = self._make_buffer(
-            self.max_num_reqs + 2, dtype=torch.int32  # type: ignore[has-type]
-        )
-
         vllm_config.scheduler_config.max_num_batched_tokens -= max_pcp_pad_tokens
         self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
         self.max_num_reqs = self.scheduler_config.max_num_seqs
@@ -516,36 +509,6 @@ def get_model(self) -> nn.Module:
             return self.model.unwrap()
         return self.model
 
-    def _pad_query_start_loc_for_fia(
-        self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int
-    ) -> int:
-        """
-        This function is only designed to satisfied the constraint that when the layout is TND,
-        the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
-        """
-
-        if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
-            # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
-            assert num_reqs <= num_reqs_padded
-
-            last_loc = self.query_start_loc.np[num_reqs]
-            self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1] = (
-                self.arange_np[1 : num_reqs_padded + 1 - num_reqs]
-                * self.uniform_decode_query_len
-                + last_loc
-            )
-        else:
-            # Mixed-batch case: num_reqs must equal num_reqs_padded
-            assert num_reqs == num_reqs_padded
-
-            # Insert a dummy request instead of setting query_start_loc[num_reqs] = num_tokens_padded directly
-            self.query_start_loc.np[num_reqs_padded + 1] = num_tokens_padded
-            num_reqs_padded = num_reqs_padded + 1
-
-        self.query_start_loc.copy_to_gpu()
-
-        return num_reqs_padded
-
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
@@ -707,6 +670,10 @@ def _prepare_inputs(
 
         self.query_start_loc.np[0] = 0
         self.query_start_loc.np[1:num_reqs + 1] = cu_num_tokens
+        # NOTE: Due to the FIA operator limitation, here we pad so that hidden_states.shape[0]
+        # and self.query_start_loc[num_reqs_padded] are equal
+        self.query_start_loc.np[num_reqs + 1:] = (self.arange_np[1:self.max_num_reqs + 1 - num_reqs]
+                                                  * self.uniform_decode_query_len + cu_num_tokens[-1])
         self.query_start_loc.copy_to_gpu()
 
         self.seq_lens.np[:num_reqs] = (
@@ -1188,7 +1155,6 @@ def execute_model(
             scheduler_output,
             num_scheduled_tokens_np,
         )
-
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
         if self.pcp_size > 1:
             num_tokens_unpadded = self.pcp_manager.total_num_sampled_tokens_pcp
@@ -1243,11 +1209,6 @@ def execute_model(
         use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0
         ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices
 
-        if cudagraph_mode != CUDAGraphMode.NONE:
-            num_reqs_padded = self._pad_query_start_loc_for_fia(
-                num_tokens_padded, num_reqs_padded, num_reqs
-            )
-
         (attn_metadata, spec_decode_common_attn_metadata) = (
             self._build_attention_metadata(
                 num_tokens=num_tokens_unpadded,
@@ -1382,6 +1343,7 @@ def execute_model(
             assert broadcasted is not None
             logits = broadcasted["logits"]
 
+
         # Apply structured output bitmasks if present
         self.execute_model_state = ExecuteModelState(
             scheduler_output,
@@ -1981,13 +1943,6 @@ def _get_block_table_and_slot_mapping(kv_cache_gid: int):
             self.long_seq_metadata = _get_pcp_metadata(num_tokens)
         block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0)
 
-        actual_last_loc = self.query_start_loc.np[num_reqs_padded]
-        error_msg = (
-            f"Due to FIA kernel constraints, when the layout is TND, "
-            f"the first dimension of `hidden_states` ({num_tokens_padded}) "
-            f"must equal the last element of `actual_seq_lengths_q` ({actual_last_loc})."
-        )
-        assert self.query_start_loc.np[num_reqs_padded] == num_tokens_padded, error_msg
         cm_base = AscendCommonAttentionMetadata(
             query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
             query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
@@ -2245,15 +2200,9 @@ def _dummy_run(
         self.seq_lens.np[:num_reqs_padded] = seq_lens
         self.seq_lens.np[num_reqs_padded:] = 0
         self.seq_lens.copy_to_gpu()
-
        cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
         self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
         self.query_start_loc.copy_to_gpu()
-
-        num_reqs_padded = self._pad_query_start_loc_for_fia(
-            num_tokens_padded, num_reqs_padded, num_reqs
-        )
-
         pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
         attn_metadata, _ = self._build_attention_metadata(
            num_tokens=num_tokens_unpadded,
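
For readers following the query_start_loc change above: the removed `_pad_query_start_loc_for_fia` helper is replaced by unconditional tail padding inside `_prepare_inputs`. The snippet below is a minimal standalone NumPy sketch of that arithmetic, not part of the commit; every size in it (3 real requests, a decode query length of 1, a capture size of 8) is invented for illustration. It shows how the unused tail of `query_start_loc` is filled with evenly spaced dummy offsets so that its last entry equals the padded token count the FIA kernel expects for a TND layout.

```python
import numpy as np

# Illustrative values only (not taken from the commit): 3 real decode
# requests, 1 token per request, batch padded up to 8 requests for capture.
max_num_reqs = 8
uniform_decode_query_len = 1
num_reqs = 3
cu_num_tokens = np.array([1, 2, 3], dtype=np.int32)  # cumulative tokens of real requests

# query_start_loc has max_num_reqs + 1 slots; slot 0 is always 0.
query_start_loc = np.zeros(max_num_reqs + 1, dtype=np.int32)
query_start_loc[1:num_reqs + 1] = cu_num_tokens

# Pad the tail the same way the new in-place logic does: each padded slot
# advances by uniform_decode_query_len past the last real cumulative count,
# so the final entry matches the padded token count.
arange = np.arange(max_num_reqs + 1, dtype=np.int32)
query_start_loc[num_reqs + 1:] = (arange[1:max_num_reqs + 1 - num_reqs]
                                  * uniform_decode_query_len + cu_num_tokens[-1])

print(query_start_loc)  # [0 1 2 3 4 5 6 7 8]; last entry == 8 padded tokens
```

The in-tree code writes into `self.query_start_loc.np` and then copies the buffer to the device; the plain arrays here only mirror the indexing so the padding invariant is easy to verify by eye.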

0 commit comments