Commit 5095e96
[V1] Revert uncache_blocks and support recaching full blocks (#12415)
Signed-off-by: Cody Yu <[email protected]>
Parent: cf58b9c

2 files changed: +16 -55 lines


tests/v1/core/test_prefix_caching.py (-30 lines)
@@ -629,33 +629,3 @@ def test_reset_prefix_cache():
     assert manager.reset_prefix_cache()
     assert not manager.cached_block_hash_to_block
     assert all([blk.block_hash is None for blk in manager.block_pool])
-
-
-def test_uncache_blocks():
-    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
-        max_model_len=8192,
-        sliding_window=None,
-        enable_caching=True,
-        num_preallocate_tokens=0,
-    )
-
-    req0 = make_request("0", list(range(30)))
-    blocks = manager.allocate_slots(req0, 30)
-    assert [b.block_id for b in blocks] == [0, 1]
-    assert len(manager.cached_block_hash_to_block) == 1
-
-    req0.num_computed_tokens = 30
-
-    # Simulate speculative tokens.
-    for _ in range(5):
-        req0.append_output_token_ids(8)
-    manager.allocate_slots(req0, 5)
-    assert len(manager.cached_block_hash_to_block) == 2
-
-    # After sampling, assuming only 1 token is accepted.
-    req0.num_computed_tokens = 31
-    num_uncached_blocks = manager.uncache_blocks(req0)
-    assert num_uncached_blocks == 1
-    assert len(manager.cached_block_hash_to_block) == 1
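
Note: the removed test leaned on simple full-block arithmetic: with block_size=16, a block is counted (and cached) only once all 16 of its slots are filled. A minimal sketch of that counting in plain Python, with the numbers taken from the test above; num_full_blocks is a hypothetical helper, not a vLLM API:

# block_size matches the KVCacheManager constructed in the removed test.
block_size = 16

def num_full_blocks(num_tokens: int) -> int:
    # A block is cached only once it is completely filled.
    return num_tokens // block_size

assert num_full_blocks(30) == 1  # tokens 0-15 fill block 0; block 1 is partial
assert num_full_blocks(35) == 2  # 5 speculative tokens fill block 1 (tokens 16-31)
assert num_full_blocks(31) == 1  # only 1 token accepted: block 1 is no longer full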

vllm/v1/core/kv_cache_manager.py (+16 -25 lines)
@@ -252,29 +252,6 @@ def free(self, request: Request) -> None:
             if block.ref_cnt == 0:
                 self.free_block_queue.append(block)
 
-    def uncache_blocks(self, request: Request) -> int:
-        """Uncache the blocks that are no longer full based on the
-        num_computed_tokens in the given request. This happens when
-        the blocks were full and cached due to speculative tokens, but the
-        speculative tokens are not accepted.
-
-        Args:
-            request: The request.
-
-        Returns:
-            The number of uncached blocks.
-        """
-        blocks = self.req_to_blocks[request.request_id]
-        num_computed_tokens = request.num_computed_tokens
-        num_full_blocks = num_computed_tokens // self.block_size
-        num_uncached_blocks = 0
-        for block in blocks[num_full_blocks:]:
-            # If the block is not cached, the following blocks are not cached.
-            if not self._maybe_evict_cached_block(block):
-                break
-            num_uncached_blocks += 1
-        return num_uncached_blocks
-
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache. This function may be used in RLHF
         flows to invalid prefix caching after the weights are updated,
@@ -470,8 +447,22 @@ def _cache_full_blocks(
             assert prev_block.block_hash is not None
             prev_block_hash_value = prev_block.block_hash.hash_value
 
-        for i, blk in enumerate(full_blocks):
-            blk_idx = blk_start_idx + i
+        # Find the first uncached block. This case should only happen when
+        # speculative decoding is used.
+        offset = 0
+        for blk in full_blocks:
+            if blk.block_hash is None:
+                break
+            else:
+                prev_block_hash_value = blk.block_hash.hash_value
+                offset += 1
+        else:
+            # All blocks are cached.
+            return
+
+        for i, blk in enumerate(full_blocks[offset:]):
+            blk_idx = blk_start_idx + offset + i
+            assert blk.block_hash is None
 
             if blk_idx < num_cached_block_hashes:
                 # The block hash may already be computed in
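
Note: the replacement logic in _cache_full_blocks scans past blocks that already carry a hash and recaches only the tail. It relies on Python's for/else: the else clause runs only when the loop finishes without hitting break, i.e., when every block is already cached. A self-contained sketch of the same control flow, using a hypothetical Block stand-in rather than vLLM's real KVCacheBlock:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Block:
    # Stand-in for vLLM's KVCacheBlock; only the field the scan needs.
    block_hash: Optional[int] = None

def first_uncached_offset(full_blocks: list[Block]) -> Optional[int]:
    """Return the index of the first block without a hash,
    or None if every block is already cached."""
    offset = 0
    for blk in full_blocks:
        if blk.block_hash is None:
            break
        offset += 1
    else:
        # Loop completed without break: all blocks are cached.
        return None
    return offset

# With speculative decoding, earlier blocks may already carry hashes
# from a previous step; only the tail needs (re)caching.
blocks = [Block(block_hash=111), Block(block_hash=222), Block(), Block()]
assert first_uncached_offset(blocks) == 2
assert first_uncached_offset([Block(block_hash=1)]) is None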
