From 91b8a3bda06bf8e32dcbb1a554b49e1ee3a2dfc5 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 19 Feb 2025 07:24:34 -0800 Subject: [PATCH] Put dealloca on timeline --- shortfin/src/shortfin/array/storage.cc | 5 +--- shortfin/src/shortfin/local/scheduler.cc | 29 ++++++++++++++++++------ shortfin/src/shortfin/local/scheduler.h | 2 ++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/shortfin/src/shortfin/array/storage.cc b/shortfin/src/shortfin/array/storage.cc index 3339d69f1..e936148f8 100644 --- a/shortfin/src/shortfin/array/storage.cc +++ b/shortfin/src/shortfin/array/storage.cc @@ -77,22 +77,19 @@ storage storage::allocate_device(ScopedDevice &device, IREE_HAL_ALLOCATOR_POOL_DEFAULT, params, allocation_size, buffer.for_output())); SHORTFIN_SCHED_LOG( - "storage::allocate_device(device={}, affinity={:x}):[{}, Wait@{}:" + "storage::allocate_device(device={}, affinity={:x}):[{}, Wait@{}->" "Signal:@{}] -> buffer={}", static_cast(device.raw_device()->hal_device()), device.affinity().queue_affinity(), static_cast(timeline_sem), current_timepoint, signal_timepoint, static_cast(buffer.get())); // Device allocations are always async. - // TODO: DO NOT SUBMIT: Enable async destruction. TimelineResourceDestructor dtor = TimelineResource::CreateAsyncBufferDestructor(device, buffer); auto resource = device.fiber().NewTimelineResource(std::move(dtor)); resource->set_mutation_barrier(timeline_sem, signal_timepoint); resource->use_barrier_insert(timeline_sem, signal_timepoint); return storage(device, std::move(buffer), std::move(resource)); - // return storage(device, std::move(buffer), - // device.fiber().NewTimelineResource()); } storage storage::allocate_host(ScopedDevice &device, diff --git a/shortfin/src/shortfin/local/scheduler.cc b/shortfin/src/shortfin/local/scheduler.cc index 7240a6db2..b3d435628 100644 --- a/shortfin/src/shortfin/local/scheduler.cc +++ b/shortfin/src/shortfin/local/scheduler.cc @@ -111,25 +111,40 @@ TimelineResource::~TimelineResource() { TimelineResourceDestructor TimelineResource::CreateAsyncBufferDestructor( ScopedDevice &scoped_device, iree::hal_buffer_ptr buffer) { - return [device = iree::hal_device_ptr::borrow_reference( - scoped_device.raw_device()->hal_device()), - affinity = scoped_device.affinity().queue_affinity(), + // The ScopedDevice doesn't lifetime extend the underlying hal device, so + // we must do that manually across the callback (and then release at the end). + iree_hal_device_retain(scoped_device.raw_device()->hal_device()); + return [device_affinity = scoped_device.affinity(), buffer = std::move(buffer)](TimelineResource &res) { + ScopedDevice scoped_device(*res.fiber(), device_affinity); + iree_hal_device_t *hal_device = scoped_device.raw_device()->hal_device(); + auto queue_affinity = scoped_device.affinity().queue_affinity(); SHORTFIN_TRACE_SCOPE_NAMED("TimelineResource::AsyncBufferDestructor"); iree_hal_semaphore_list_t wait_semaphore_list = res.use_barrier(); - iree_hal_semaphore_list_t signal_semaphore_list = - iree_hal_semaphore_list_empty(); + // TODO: If desiring strict memory reclamation ordering, we want to queue + // order the successors to the deallocation. + auto fiber = res.fiber(); + auto &account = fiber->scheduler().GetDefaultAccount(scoped_device); + iree_hal_semaphore_t *timeline_sem = account.timeline_sem(); + uint64_t signal_timepoint = account.timeline_acquire_timepoint(); + iree_hal_semaphore_list_t signal_semaphore_list{ + .count = 1, + .semaphores = &timeline_sem, + .payload_values = &signal_timepoint, + }; if (SHORTFIN_SCHED_LOG_ENABLED) { auto wait_sum = iree::DebugPrintSemaphoreList(wait_semaphore_list); auto signal_sum = iree::DebugPrintSemaphoreList(signal_semaphore_list); SHORTFIN_SCHED_LOG( "async dealloca(device={}, affinity={:x}, buffer={}):[Wait:{}, " "Signal:{}]", - static_cast(device.get()), affinity, + static_cast(hal_device), queue_affinity, static_cast(buffer.get()), wait_sum, signal_sum); } SHORTFIN_THROW_IF_ERROR(iree_hal_device_queue_dealloca( - device, affinity, wait_semaphore_list, signal_semaphore_list, buffer)); + hal_device, queue_affinity, wait_semaphore_list, signal_semaphore_list, + buffer)); + iree_hal_device_release(hal_device); }; } diff --git a/shortfin/src/shortfin/local/scheduler.h b/shortfin/src/shortfin/local/scheduler.h index eb7e7a3c9..17881955b 100644 --- a/shortfin/src/shortfin/local/scheduler.h +++ b/shortfin/src/shortfin/local/scheduler.h @@ -169,6 +169,8 @@ class SHORTFIN_API TimelineResource { if (--refcnt_ == 0) delete this; } + Fiber *fiber() { return fiber_.get(); } + private: TimelineResource(std::shared_ptr fiber, size_t semaphore_capacity, TimelineResourceDestructor destructor);