@@ -252,29 +252,6 @@ def free(self, request: Request) -> None:
252
252
if block .ref_cnt == 0 :
253
253
self .free_block_queue .append (block )
254
254
255
def uncache_blocks(self, request: Request) -> int:
    """Drop from the cache the request's blocks that are no longer full.

    A block can become full (and therefore cached) because of speculative
    tokens. When those tokens are rejected, every block past the
    request's num_computed_tokens has to be uncached again.

    Args:
        request: The request whose trailing blocks are examined.

    Returns:
        How many blocks were uncached.
    """
    request_blocks = self.req_to_blocks[request.request_id]
    # Index of the first block that is not completely covered by
    # computed tokens.
    first_non_full = request.num_computed_tokens // self.block_size
    candidates = request_blocks[first_non_full:]

    evicted = 0
    # Walk forward until a block turns out not to be cached; once one
    # block is not cached, the blocks after it are not cached either.
    while evicted < len(candidates):
        if not self._maybe_evict_cached_block(candidates[evicted]):
            break
        evicted += 1
    return evicted
277
-
278
255
def reset_prefix_cache (self ) -> bool :
279
256
"""Reset prefix cache. This function may be used in RLHF
280
257
flows to invalid prefix caching after the weights are updated,
@@ -470,8 +447,22 @@ def _cache_full_blocks(
470
447
assert prev_block .block_hash is not None
471
448
prev_block_hash_value = prev_block .block_hash .hash_value
472
449
473
- for i , blk in enumerate (full_blocks ):
474
- blk_idx = blk_start_idx + i
450
+ # Find the first uncached block. This case should only happen when
451
+ # speculative decoding is used.
452
+ offset = 0
453
+ for blk in full_blocks :
454
+ if blk .block_hash is None :
455
+ break
456
+ else :
457
+ prev_block_hash_value = blk .block_hash .hash_value
458
+ offset += 1
459
+ else :
460
+ # All blocks are cached.
461
+ return
462
+
463
+ for i , blk in enumerate (full_blocks [offset :]):
464
+ blk_idx = blk_start_idx + offset + i
465
+ assert blk .block_hash is None
475
466
476
467
if blk_idx < num_cached_block_hashes :
477
468
# The block hash may already be computed in
0 commit comments