
Commit 6bd06d7

fix
yma11 committed Jan 31, 2025
1 parent bfc49fd commit 6bd06d7
Showing 1 changed file with 4 additions and 2 deletions:
vllm/worker/hpu_enc_dec_model_runner.py
```diff
@@ -361,7 +361,8 @@ def profile_run(self) -> None:
         # Enable top-k sampling to reflect the accurate memory usage.
         sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
         max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
-        max_num_seqs = self.scheduler_config.max_num_seqs
+        # Workaround to avoid unexpected OOM failure during profile run
+        max_num_seqs = int(self.scheduler_config.max_num_seqs / 2)
 
         # Profile memory usage with max_num_sequences sequences and the total
         # number of tokens equal to max_num_batched_tokens.
```
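The first change halves the number of sequences used for the warm-up profile: profiling at the full max_num_seqs was transiently allocating more device memory than steady-state serving, which could OOM the profile run before any real work started. Below is a minimal sketch of the arithmetic, with a hypothetical stand-in config (SchedulerConfig and profile_batch_shape here are illustrative, not vLLM's actual classes):

```python
# Illustrative sketch of the profile-run workaround, not vLLM's actual code.
from dataclasses import dataclass


@dataclass
class SchedulerConfig:  # hypothetical stand-in for vLLM's scheduler config
    max_num_seqs: int = 256
    max_num_batched_tokens: int = 8192


def profile_batch_shape(cfg: SchedulerConfig) -> tuple[int, int]:
    # Halve the configured sequence count to leave headroom for the
    # profile pass's transient allocations on HPU.
    max_num_seqs = int(cfg.max_num_seqs / 2)
    # Spread the token budget across the profiled sequences, as the
    # surrounding profile_run does with max_num_batched_tokens.
    tokens_per_seq = cfg.max_num_batched_tokens // max(max_num_seqs, 1)
    return max_num_seqs, tokens_per_seq


print(profile_batch_shape(SchedulerConfig()))  # -> (128, 64)
```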
```diff
@@ -432,7 +433,8 @@ def profile_run(self) -> None:
             seqs, finished_requests_ids=finished_requests_ids)
         intermediate_tensors = None
         self.execute_model(model_input, kv_caches, intermediate_tensors)
-        torch.cuda.synchronize()
+        torch.hpu.synchronize()
+        gc.collect()
         return
 
     def trim_attn_metadata(self, metadata: AttentionMetadata) -> object:
```
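The second change targets the right accelerator: this runner executes on Habana Gaudi (HPU), so synchronizing torch.cuda did nothing useful there, and the added gc.collect() releases the profiling tensors before free device memory is measured. Below is a hedged sketch of a device-aware variant, assuming the habana_frameworks.torch plugin registers the torch.hpu namespace (synchronize_device is an illustrative helper, not the runner's API):

```python
# Hedged sketch of device-aware cleanup after a profile pass; illustrative,
# not the runner's actual code. torch.hpu exists only when the
# habana_frameworks.torch plugin is installed (assumption).
import gc

import torch


def synchronize_device() -> None:
    if hasattr(torch, "hpu"):
        torch.hpu.synchronize()  # drain all queued HPU work
    elif torch.cuda.is_available():
        torch.cuda.synchronize()  # the path the commit replaces
    gc.collect()  # drop profiling tensors before free memory is measured


synchronize_device()
```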
