From b299af379b88391b9265f5c70a669c49ee517d3c Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 19 Feb 2025 09:38:54 -0800 Subject: [PATCH] [shortfin] Implement async alloc/dealloc of buffers. (#507) * Device allocations are now async, queue ordered alloc/dealloc. * Program invocations asynchronously deallocate function call results if it can. If it ever cannot, then a small tracy zone `SyncImportTimelineResource` will be emitted per result that cannot be async deallocated. * Adds `ProgramInvocation.assume_no_alias` instance boolean to disable the assumption which allows async deallocation to work. * Adds global `ProgramIncovation.global_no_alias` property to control process-wide. This is a very fiddly optimization which requires (esp in multi-device cases) a number of things to line up. Tested on amdgpu and CPU with a number of sample workloads (with logging enabled and visually confirmed). See #980 for detailed analysis and further work required. --- shortfin/python/lib_ext.cc | 176 ++++++++++------ shortfin/src/shortfin/array/array.cc | 10 +- shortfin/src/shortfin/array/array.h | 4 +- shortfin/src/shortfin/array/storage.cc | 93 ++++++--- shortfin/src/shortfin/array/storage.h | 19 +- shortfin/src/shortfin/local/fiber.h | 5 +- shortfin/src/shortfin/local/program.cc | 190 +++++++++++++++++- shortfin/src/shortfin/local/program.h | 65 +++++- .../src/shortfin/local/program_interfaces.h | 10 +- shortfin/src/shortfin/local/scheduler.cc | 54 ++++- shortfin/src/shortfin/local/scheduler.h | 46 ++++- shortfin/src/shortfin/support/iree_helpers.cc | 10 + shortfin/src/shortfin/support/iree_helpers.h | 6 + .../invocation/mobilenet_program_test.py | 15 +- 14 files changed, 569 insertions(+), 134 deletions(-) diff --git a/shortfin/python/lib_ext.cc b/shortfin/python/lib_ext.cc index af7aba767..adfccf906 100644 --- a/shortfin/python/lib_ext.cc +++ b/shortfin/python/lib_ext.cc @@ -316,32 +316,97 @@ local::ProgramInvocation::Future PyFunctionCall( return 
local::ProgramInvocation::Invoke(std::move(inv)); } -py::object PyRehydrateRef(local::ProgramInvocation *inv, - iree::vm_opaque_ref ref) { - auto type = ref.get()->type; - // Note that these accessors are dangerous as they assert/abort if - // process-wide registration is not done properly. We assume here that - // since we got a ref out that the basics are set up soundly, but if actually - // doing this on user/dynamic types, we would want to be more defensive. - // TODO: Don't just do a linear scan if we have more than a couple. - // TODO: Find a reliable way to statically cache the type id. - if (local::ProgramInvocationMarshalableFactory::invocation_marshalable_type< - array::device_array>() == type) { - // device_array - return py::cast(local::ProgramInvocationMarshalableFactory:: - CreateFromInvocationResultRef( - inv, std::move(ref))); - } else if (local::ProgramInvocationMarshalableFactory:: - invocation_marshalable_type() == type) { - // storage - return py::cast( - local::ProgramInvocationMarshalableFactory:: - CreateFromInvocationResultRef(inv, std::move(ref))); +// Wraps a ProgramInvocation::Ptr representing a completed (awaited) invocation. +// Holds some additional accounting for marshaling results back to Python. +class PyProgramInvocation { + public: + PyProgramInvocation(local::ProgramInvocation::Ptr inv) + : inv_(std::move(inv)) {} + PyProgramInvocation(const PyProgramInvocation &) = delete; + PyProgramInvocation(PyProgramInvocation &&other) + : inv_(std::move(other.inv_)), + cached_results_(std::move(other.cached_results_)), + results_failure_(other.results_failure_) {} + + // Fields that can be bound. 
+ bool assume_no_alias = true; + static std::optional global_assume_no_alias; + + void CheckValid() { + if (!inv_) throw std::invalid_argument("Deallocated invocation"); } - throw std::invalid_argument( - fmt::format("Cannot marshal ref type {} to Python", - to_string_view(iree_vm_ref_type_name(type)))); -} + local::ProgramInvocation::Ptr &inv() { return inv_; } + + py::object results() { + if (results_failure_) { + throw std::logic_error("Prior attempt to marshal IREE results failed"); + } + if (cached_results_) { + return cached_results_; + } + + // Cache results. + CheckValid(); + results_failure_ = true; + + local::CoarseInvocationTimelineImporter::Options options; + options.assume_no_alias = assume_no_alias; + if (global_assume_no_alias) { + options.assume_no_alias = *global_assume_no_alias; + } + local::CoarseInvocationTimelineImporter timeline_importer(inv().get(), + options); + size_t size = inv_->results_size(); + py::object tp = py::steal(PyTuple_New(size)); + for (size_t i = 0; i < size; ++i) { + iree::vm_opaque_ref ref = inv_->result_ref(i); + if (!ref) { + throw new std::logic_error("Program returned unsupported Python type"); + } + py::object item = RehydrateRef(std::move(ref), &timeline_importer); + PyTuple_SET_ITEM(tp.ptr(), i, item.release().ptr()); + } + + cached_results_ = std::move(tp); + results_failure_ = false; + return cached_results_; + } + + private: + py::object RehydrateRef( + iree::vm_opaque_ref ref, + local::CoarseInvocationTimelineImporter *timeline_importer) { + auto type = ref.get()->type; + // Note that these accessors are dangerous as they assert/abort if + // process-wide registration is not done properly. We assume here that + // since we got a ref out that the basics are set up soundly, but if + // actually doing this on user/dynamic types, we would want to be more + // defensive. + // TODO: Don't just do a linear scan if we have more than a couple. + // TODO: Find a reliable way to statically cache the type id. 
+ if (local::ProgramInvocationMarshalableFactory::invocation_marshalable_type< + array::device_array>() == type) { + // device_array + return py::cast(local::ProgramInvocationMarshalableFactory:: + CreateFromInvocationResultRef( + inv().get(), timeline_importer, std::move(ref))); + } else if (local::ProgramInvocationMarshalableFactory:: + invocation_marshalable_type() == type) { + // storage + return py::cast(local::ProgramInvocationMarshalableFactory:: + CreateFromInvocationResultRef( + inv().get(), timeline_importer, std::move(ref))); + } + throw std::invalid_argument( + fmt::format("Cannot marshal ref type {} to Python", + to_string_view(iree_vm_ref_type_name(type)))); + } + + local::ProgramInvocation::Ptr inv_; + py::object cached_results_; + bool results_failure_ = false; +}; +std::optional PyProgramInvocation::global_assume_no_alias; py::object RunInForeground(std::shared_ptr refs, local::System &self, py::object coro) { @@ -743,56 +808,45 @@ void BindLocal(py::module_ &m) { return local::ProgramModule::ParameterProvider(system, c_params); }, py::arg("system"), py::arg("params")); - py::class_(m, "ProgramInvocation") + py::class_(m, "ProgramInvocation") + .def_rw("assume_no_alias", &PyProgramInvocation::assume_no_alias, + "Assumes that no results alias inputs or other buffers") + .def_rw_static( + "global_assume_no_alias", + &PyProgramInvocation::global_assume_no_alias, + "Globally changes the assume_no_alias flag for all invocations") .def("invoke", - [](local::ProgramInvocation::Ptr &self) { - if (!self) throw std::invalid_argument("Deallocated invocation"); - return local::ProgramInvocation::Invoke(std::move(self)); + [](PyProgramInvocation &self) { + self.CheckValid(); + return local::ProgramInvocation::Invoke(std::move(self.inv())); }) .def("add_arg", - [](local::ProgramInvocation::Ptr &self, py::handle arg) { - if (!self) throw std::invalid_argument("Deallocated invocation"); - py::capsule inv_capsule(self.get()); + [](PyProgramInvocation &self, 
py::handle arg) { + self.CheckValid(); + py::capsule inv_capsule(&self.inv()); PyAddProgramInvocationArg(inv_capsule, arg); }) .def("__iter__", - [](local::ProgramInvocation::Ptr &self) { - if (!self) throw std::invalid_argument("Deallocated invocation"); - size_t size = self->results_size(); - py::object tp = py::steal(PyTuple_New(size)); - for (size_t i = 0; i < size; ++i) { - iree::vm_opaque_ref ref = self->result_ref(i); - if (!ref) { - throw new std::logic_error( - "Program returned unsupported Python type"); - } - py::object item = PyRehydrateRef(self.get(), std::move(ref)); - PyTuple_SET_ITEM(tp.ptr(), i, item.release().ptr()); - } - return tp.attr("__iter__")(); + [](PyProgramInvocation &self) { + return self.results().attr("__iter__")(); }) .def( "__len__", - [](local::ProgramInvocation::Ptr &self) { - if (!self) throw std::invalid_argument("Deallocated invocation"); - return self->results_size(); + [](PyProgramInvocation &self) { + self.CheckValid(); + return self.inv()->results_size(); }, "The number of results in this invocation") .def( "__getitem__", - [](local::ProgramInvocation::Ptr &self, iree_host_size_t i) { - if (!self) throw std::invalid_argument("Deallocated invocation"); - iree::vm_opaque_ref ref = self->result_ref(i); - if (!ref) { - throw new std::logic_error( - "Program returned unsupported Python type"); - } - return PyRehydrateRef(self.get(), std::move(ref)); + [](PyProgramInvocation &self, iree_host_size_t i) { + self.CheckValid(); + return self.results().attr("__getitem__")(i); }, "Gets the i'th result") - .def("__repr__", [](local::ProgramInvocation::Ptr &self) { - if (!self) return std::string("ProgramInvocation(INVALID)"); - return self->to_s(); + .def("__repr__", [](PyProgramInvocation &self) { + if (!self.inv()) return std::string("ProgramInvocation(INVALID)"); + return self.inv()->to_s(); }); py::class_(m, "BaseProgramParameters"); @@ -1207,7 +1261,7 @@ void BindLocal(py::module_ &m) { // expensive in the C++ API: essentially, 
ProgramInvocations flow // through the system precisely one way. As a low level facility, this // is deemed acceptable. - return py::cast(std::move(result)); + return py::cast(PyProgramInvocation(std::move(result))); }); py::class_(m, "MessageFuture") .def("result", [](local::MessageFuture &self) { diff --git a/shortfin/src/shortfin/array/array.cc b/shortfin/src/shortfin/array/array.cc index c0eca52d0..7b235fd11 100644 --- a/shortfin/src/shortfin/array/array.cc +++ b/shortfin/src/shortfin/array/array.cc @@ -109,7 +109,7 @@ void device_array::AddAsInvocationArgument( iree::vm_opaque_ref ref; *(&ref) = iree_hal_buffer_view_move_ref(buffer_view); - inv->AddArg(std::move(ref)); + inv->AddArg(std::move(ref), storage().timeline_resource_.get()); storage().AddInvocationArgBarrier(inv, barrier); } @@ -119,7 +119,9 @@ iree_vm_ref_type_t device_array::invocation_marshalable_type() { } device_array device_array::CreateFromInvocationResultRef( - local::ProgramInvocation *inv, iree::vm_opaque_ref ref) { + local::ProgramInvocation *inv, + local::CoarseInvocationTimelineImporter *timeline_importer, + iree::vm_opaque_ref ref) { SHORTFIN_TRACE_SCOPE_NAMED("PyDeviceArray::CreateFromInvocationResultRef"); // We don't retain the buffer view in the device array, so just deref it // vs stealing the ref. 
@@ -127,8 +129,8 @@ device_array device_array::CreateFromInvocationResultRef( iree::hal_buffer_ptr buffer = iree::hal_buffer_ptr::borrow_reference(iree_hal_buffer_view_buffer(bv)); - auto imported_storage = - storage::ImportInvocationResultStorage(inv, std::move(buffer)); + auto imported_storage = storage::ImportInvocationResultStorage( + inv, timeline_importer, std::move(buffer)); std::span shape(iree_hal_buffer_view_shape_dims(bv), iree_hal_buffer_view_shape_rank(bv)); return device_array( diff --git a/shortfin/src/shortfin/array/array.h b/shortfin/src/shortfin/array/array.h index 8af584849..6cee0bfd1 100644 --- a/shortfin/src/shortfin/array/array.h +++ b/shortfin/src/shortfin/array/array.h @@ -216,7 +216,9 @@ class SHORTFIN_API device_array void AddAsInvocationArgument(local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) override; static device_array CreateFromInvocationResultRef( - local::ProgramInvocation *inv, iree::vm_opaque_ref ref); + local::ProgramInvocation *inv, + local::CoarseInvocationTimelineImporter *timeline_importer, + iree::vm_opaque_ref ref); static iree_vm_ref_type_t invocation_marshalable_type(); friend class shortfin::local::ProgramInvocationMarshalableFactory; }; diff --git a/shortfin/src/shortfin/array/storage.cc b/shortfin/src/shortfin/array/storage.cc index a36991d1b..e936148f8 100644 --- a/shortfin/src/shortfin/array/storage.cc +++ b/shortfin/src/shortfin/array/storage.cc @@ -33,12 +33,14 @@ storage::storage(local::ScopedDevice device, iree::hal_buffer_ptr buffer, device_(device) { logging::construct("array::storage", this); } -storage::~storage() { logging::destruct("array::storage", this); } - -storage storage::import_buffer(local::ScopedDevice &device, - iree::hal_buffer_ptr buffer) { - return storage(device, std::move(buffer), - device.fiber().NewTimelineResource()); +storage::~storage() { + logging::destruct("array::storage", this); + SHORTFIN_TRACE_SCOPE_NAMED("storage::~storage"); + // The timeline resource holds 
the back reference to the owning fiber, + // which keeps all devices alive. Buffers must be destroyed before devices, + // so destruction sequencing matters and we make it explicit. + buffer_.reset(); + timeline_resource_.reset(); } storage storage::allocate_device(ScopedDevice &device, @@ -47,7 +49,6 @@ storage storage::allocate_device(ScopedDevice &device, if (!device.raw_device()) { throw std::invalid_argument("Cannot allocate with a null device affinity"); } - auto allocator = iree_hal_device_allocator(device.raw_device()->hal_device()); iree::hal_buffer_ptr buffer; iree_hal_buffer_params_t params = { .usage = IREE_HAL_BUFFER_USAGE_DEFAULT, @@ -55,10 +56,40 @@ storage storage::allocate_device(ScopedDevice &device, .type = IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE, .queue_affinity = device.affinity().queue_affinity(), }; - SHORTFIN_THROW_IF_ERROR(iree_hal_allocator_allocate_buffer( - allocator, params, allocation_size, buffer.for_output())); - return storage(device, std::move(buffer), - device.fiber().NewTimelineResource()); + Account &account = device.fiber().scheduler().GetDefaultAccount(device); + iree_hal_semaphore_t *timeline_sem = account.timeline_sem(); + uint64_t current_timepoint = account.timeline_idle_timepoint(); + uint64_t signal_timepoint = account.timeline_acquire_timepoint(); + iree_hal_semaphore_list_t wait_semaphore_list{ + .count = 1, + .semaphores = &timeline_sem, + .payload_values = ¤t_timepoint, + }; + iree_hal_semaphore_list_t signal_semaphore_list{ + .count = 1, + .semaphores = &timeline_sem, + .payload_values = &signal_timepoint, + }; + // Async allocate. 
+ SHORTFIN_THROW_IF_ERROR(iree_hal_device_queue_alloca( + device.raw_device()->hal_device(), device.affinity().queue_affinity(), + wait_semaphore_list, signal_semaphore_list, + IREE_HAL_ALLOCATOR_POOL_DEFAULT, params, allocation_size, + buffer.for_output())); + SHORTFIN_SCHED_LOG( + "storage::allocate_device(device={}, affinity={:x}):[{}, Wait@{}->" + "Signal:@{}] -> buffer={}", + static_cast(device.raw_device()->hal_device()), + device.affinity().queue_affinity(), static_cast(timeline_sem), + current_timepoint, signal_timepoint, static_cast(buffer.get())); + + // Device allocations are always async. + TimelineResourceDestructor dtor = + TimelineResource::CreateAsyncBufferDestructor(device, buffer); + auto resource = device.fiber().NewTimelineResource(std::move(dtor)); + resource->set_mutation_barrier(timeline_sem, signal_timepoint); + resource->use_barrier_insert(timeline_sem, signal_timepoint); + return storage(device, std::move(buffer), std::move(resource)); } storage storage::allocate_host(ScopedDevice &device, @@ -101,11 +132,9 @@ void storage::fill(const void *pattern, iree_host_size_t pattern_length) { device_.fiber().scheduler().AppendCommandBuffer( device_, TransactionType::TRANSFER, [&](Account &account) { // Must depend on all of this buffer's use dependencies to avoid - // write-after-read hazard. + // write-after-read hazard (which implicitly includes + // write-after-write). account.active_deps_extend(timeline_resource_->use_barrier()); - // And depend on any prior mutation in order to avoid a - // write-after-write hazard. - account.active_deps_extend(timeline_resource_->mutation_barrier()); SHORTFIN_SCHED_LOG(" : FillBuffer({})", static_cast(buffer_.get())); @@ -116,10 +145,12 @@ void storage::fill(const void *pattern, iree_host_size_t pattern_length) { /*length=*/iree_hal_buffer_byte_length(buffer_)), pattern, pattern_length, IREE_HAL_FILL_FLAG_NONE)); - // And move our own mutation barrier to the current pending timeline - // value. 
+ // And move our own use and mutation barrier to the current pending + // timeline value. timeline_resource_->set_mutation_barrier( account.timeline_sem(), account.timeline_idle_timepoint()); + timeline_resource_->use_barrier_insert( + account.timeline_sem(), account.timeline_idle_timepoint()); }); } @@ -132,7 +163,6 @@ void storage::copy_from(storage &source_storage) { source_storage.timeline_resource_->mutation_barrier()); // And depend on our own use and mutations dependencies. account.active_deps_extend(timeline_resource_->use_barrier()); - account.active_deps_extend(timeline_resource_->mutation_barrier()); SHORTFIN_SCHED_LOG(" : CopyBuffer({} -> {})", static_cast(source_storage.buffer_.get()), @@ -145,10 +175,12 @@ void storage::copy_from(storage &source_storage) { iree_hal_make_buffer_ref(buffer_, 0, byte_length()), IREE_HAL_COPY_FLAG_NONE)); - // Move our own mutation barrier to the current pending timeline + // Move our own use and mutation barrier to the current pending timeline // value. timeline_resource_->set_mutation_barrier( account.timeline_sem(), account.timeline_idle_timepoint()); + timeline_resource_->use_barrier_insert( + account.timeline_sem(), account.timeline_idle_timepoint()); // And extend the source use barrier. 
source_storage.timeline_resource_->use_barrier_insert( account.timeline_sem(), account.timeline_idle_timepoint()); @@ -213,7 +245,7 @@ void storage::AddAsInvocationArgument(local::ProgramInvocation *inv, SHORTFIN_TRACE_SCOPE_NAMED("storage::AddAsInvocationArgument"); iree::vm_opaque_ref ref; *(&ref) = iree_hal_buffer_retain_ref(buffer_); - inv->AddArg(std::move(ref)); + inv->AddArg(std::move(ref), timeline_resource_.get()); AddInvocationArgBarrier(inv, barrier); } @@ -222,23 +254,31 @@ iree_vm_ref_type_t storage::invocation_marshalable_type() { return iree_hal_buffer_type(); } -storage storage::CreateFromInvocationResultRef(local::ProgramInvocation *inv, - iree::vm_opaque_ref ref) { +storage storage::CreateFromInvocationResultRef( + local::ProgramInvocation *inv, + local::CoarseInvocationTimelineImporter *timeline_importer, + iree::vm_opaque_ref ref) { SHORTFIN_TRACE_SCOPE_NAMED("storage::CreateFromInvocationResultRef"); // Steal the ref to one of our smart pointers. // TODO: Should have an opaque_ref::release(). 
iree::hal_buffer_ptr buffer = iree::hal_buffer_ptr::steal_reference(iree_hal_buffer_deref(*ref.get())); (&ref)->ptr = nullptr; - return ImportInvocationResultStorage(inv, std::move(buffer)); + return ImportInvocationResultStorage(inv, timeline_importer, + std::move(buffer)); } -storage storage::ImportInvocationResultStorage(local::ProgramInvocation *inv, - iree::hal_buffer_ptr buffer) { +storage storage::ImportInvocationResultStorage( + local::ProgramInvocation *inv, + local::CoarseInvocationTimelineImporter *timeline_importer, + iree::hal_buffer_ptr buffer) { SHORTFIN_TRACE_SCOPE_NAMED("storage::ImportInvocationResultStorage"); local::ScopedDevice device = local::ScopedDevice(*inv->fiber(), inv->device_selection()); - auto imported_storage = storage::import_buffer(device, std::move(buffer)); + iree_hal_buffer_t *raw_buffer = buffer.get(); + storage imported_storage( + device, std::move(buffer), + timeline_importer->ImportTimelineResource(raw_buffer)); auto coarse_signal = inv->coarse_signal(); if (coarse_signal.first) { @@ -265,7 +305,6 @@ void storage::AddInvocationArgBarrier(local::ProgramInvocation *inv, inv->DeviceSelect(device_.affinity()); break; case ProgramResourceBarrier::WRITE: - inv->wait_insert(timeline_resource_->mutation_barrier()); inv->wait_insert(timeline_resource_->use_barrier()); inv->DeviceSelect(device_.affinity()); break; diff --git a/shortfin/src/shortfin/array/storage.h b/shortfin/src/shortfin/array/storage.h index 2ea8f5aef..9b6bbe704 100644 --- a/shortfin/src/shortfin/array/storage.h +++ b/shortfin/src/shortfin/array/storage.h @@ -79,9 +79,6 @@ class SHORTFIN_API storage : public local::ProgramInvocationMarshalable { const local::ScopedDevice &device() const { return device_; } local::Fiber &fiber() const { return device_.fiber(); } - static storage import_buffer(local::ScopedDevice &device, - iree::hal_buffer_ptr buffer); - // Allocates device storage, compatible with the given device affinity. 
// By default, this will be IREE_HAL_MEMORY_TYPE_OPTIMAL_FOR_DEVICE. static storage allocate_device(local::ScopedDevice &device, @@ -178,11 +175,14 @@ class SHORTFIN_API storage : public local::ProgramInvocationMarshalable { private: storage(local::ScopedDevice device, iree::hal_buffer_ptr buffer, local::detail::TimelineResource::Ref timeline_resource); + void AsyncDeallocate(); // ProgramInvocationMarshalable implementation. void AddAsInvocationArgument(local::ProgramInvocation *inv, local::ProgramResourceBarrier barrier) override; - static storage CreateFromInvocationResultRef(local::ProgramInvocation *inv, - iree::vm_opaque_ref ref); + static storage CreateFromInvocationResultRef( + local::ProgramInvocation *inv, + local::CoarseInvocationTimelineImporter *timeline_importer, + iree::vm_opaque_ref ref); static iree_vm_ref_type_t invocation_marshalable_type(); // Adds any necessary wait barriers to the invocation on behalf of this @@ -192,12 +192,11 @@ class SHORTFIN_API storage : public local::ProgramInvocationMarshalable { // Imports a raw hal buffer from an invocation as a storage, attaching any // needed barriers. - static storage ImportInvocationResultStorage(local::ProgramInvocation *inv, - iree::hal_buffer_ptr buffer); + static storage ImportInvocationResultStorage( + local::ProgramInvocation *inv, + local::CoarseInvocationTimelineImporter *timeline_importer, + iree::hal_buffer_ptr buffer); - // The timeline resource holds the back reference to the owning fiber, - // which keeps all devices alive. Buffers must be destroyed before devices, - // so this must be declared first. 
local::detail::TimelineResource::Ref timeline_resource_; iree::hal_buffer_ptr buffer_; local::ScopedDevice device_; diff --git a/shortfin/src/shortfin/local/fiber.h b/shortfin/src/shortfin/local/fiber.h index afd65b346..077a721ad 100644 --- a/shortfin/src/shortfin/local/fiber.h +++ b/shortfin/src/shortfin/local/fiber.h @@ -127,8 +127,9 @@ class SHORTFIN_API Fiber : public std::enable_shared_from_this { return ScopedDevice(*this, DeviceAffinity(d)); } detail::Scheduler &scheduler() { return scheduler_; } - detail::TimelineResource::Ref NewTimelineResource() { - return scheduler().NewTimelineResource(shared_ptr()); + detail::TimelineResource::Ref NewTimelineResource( + detail::TimelineResourceDestructor destructor = nullptr) { + return scheduler().NewTimelineResource(shared_ptr(), std::move(destructor)); } private: diff --git a/shortfin/src/shortfin/local/program.cc b/shortfin/src/shortfin/local/program.cc index 9744c1691..7cd122e72 100644 --- a/shortfin/src/shortfin/local/program.cc +++ b/shortfin/src/shortfin/local/program.cc @@ -314,10 +314,18 @@ void ProgramInvocation::Deleter::operator()(ProgramInvocation *inst) { // Trailing arg list and result list. The arg list pointer is only available // at construction, so we use the knowledge that it is stored right after // the object. The result_list_ is available for the life of the invocation. - iree_vm_list_deinitialize(static_cast( - static_cast(memory + sizeof(ProgramInvocation)))); + auto *arg_list = static_cast( + static_cast(memory + sizeof(ProgramInvocation))); + iree_host_size_t arg_size = iree_vm_list_size(arg_list); + + iree_vm_list_deinitialize(arg_list); iree_vm_list_deinitialize(inst->result_list_); + // Release any arg resource references. + for (iree_host_size_t i = 0; i < arg_size; ++i) { + if (inst->arg_resources_[i]) inst->arg_resources_[i]->Release(); + } + // Was allocated in New as a uint8_t[] so delete it by whence it came. 
delete[] memory; } @@ -352,14 +360,15 @@ ProgramInvocation::Ptr ProgramInvocation::New( iree_vm_list_storage_size(&variant_type_def, arg_count); iree_host_size_t result_storage_size = iree_vm_list_storage_size(&variant_type_def, result_count); + iree_host_size_t arg_resource_size = + sizeof(detail::TimelineResource *) * arg_count; // Allocate storage for the ProgramInvocation, arg, result list and placement // new the ProgramInvocation into the storage area. std::unique_ptr inst_storage( new uint8_t[sizeof(ProgramInvocation) + arg_storage_size + - result_storage_size]); + result_storage_size + arg_resource_size]); new (inst_storage.get()) ProgramInvocation(); - // Initialize trailing lists. Abort on failure since this is a bug and we // would otherwise leak. iree_vm_list_t *arg_list; @@ -374,6 +383,10 @@ ProgramInvocation::Ptr ProgramInvocation::New( .data_length = result_storage_size}, &variant_type_def, result_count, &result_list)); + uint8_t *arg_resource_ptr = inst_storage.get() + sizeof(ProgramInvocation) + + arg_storage_size + result_storage_size; + std::memset(arg_resource_ptr, 0, arg_resource_size); + Ptr inst(static_cast( static_cast(inst_storage.release())), Deleter()); @@ -383,6 +396,8 @@ ProgramInvocation::Ptr ProgramInvocation::New( inst->state.params.function = vm_function; inst->state.params.invocation_model = invocation_model; inst->result_list_ = result_list; + inst->arg_resources_ = static_cast( + static_cast(arg_resource_ptr)); return inst; } @@ -392,14 +407,26 @@ void ProgramInvocation::CheckNotScheduled() { } } -void ProgramInvocation::AddArg(iree::vm_opaque_ref ref) { +void ProgramInvocation::AddArg(iree::vm_opaque_ref ref, + detail::TimelineResource *resource) { CheckNotScheduled(); + iree_host_size_t arg_index = iree_vm_list_size(arg_list()); SHORTFIN_THROW_IF_ERROR(iree_vm_list_push_ref_move(arg_list(), &ref)); + if (resource) { + arg_resources_[arg_index] = resource; + resource->Retain(); + } } -void ProgramInvocation::AddArg(iree_vm_ref_t 
*ref) { +void ProgramInvocation::AddArg(iree_vm_ref_t *ref, + detail::TimelineResource *resource) { CheckNotScheduled(); + iree_host_size_t arg_index = iree_vm_list_size(arg_list()); SHORTFIN_THROW_IF_ERROR(iree_vm_list_push_ref_retain(arg_list(), ref)); + if (resource) { + arg_resources_[arg_index] = resource; + resource->Retain(); + } } iree_status_t ProgramInvocation::FinalizeCallingConvention( @@ -423,6 +450,15 @@ iree_status_t ProgramInvocation::FinalizeCallingConvention( iree_hal_fence_insert(maybe_wait_fence, timeline_sem, timeline_now)); signal_sem_ = sched_account.timeline_sem(); signal_timepoint_ = sched_account.timeline_acquire_timepoint(); + + // Extend any arg resources to our signal timepoint. + iree_host_size_t arg_count = iree_vm_list_size(arg_list); + for (iree_host_size_t i = 0; i < arg_count; ++i) { + detail::TimelineResource *resource = arg_resources_[i]; + if (resource) { + resource->use_barrier_insert(signal_sem_, signal_timepoint_); + } + } } // Push wait fence (or null if no wait needed). 
@@ -683,6 +719,148 @@ void StaticProgramParameters::LoadMmap(std::filesystem::path file_path, host_allocator_)); } +// -------------------------------------------------------------------------- // +// CoarseInvocationTimelineImporter +// -------------------------------------------------------------------------- // + +CoarseInvocationTimelineImporter::CoarseInvocationTimelineImporter( + ProgramInvocation *inv, Options &options) + : inv_(inv), options_(options) { + SHORTFIN_TRACE_SCOPE_NAMED("CoarseInvocationTimelineImporter"); + auto buffer_type = iree_hal_buffer_type(); + auto buffer_view_type = iree_hal_buffer_view_type(); + for (iree_host_size_t i = 0; i < inv->results_size(); ++i) { + auto ref = inv->result_ref(i); + auto type = ref.get()->type; + if (type == buffer_type) { + iree_hal_buffer_t *buffer = iree_hal_buffer_deref(*ref.get()); + CatalogInternal(buffer, false); + } else if (type == buffer_view_type) { + iree_hal_buffer_view_t *bv = iree_hal_buffer_view_deref(*ref.get()); + iree_hal_buffer_t *buffer = iree_hal_buffer_view_buffer(bv); + CatalogInternal(buffer, true); + } + } + FinalizeCatalog(); +} + +void CoarseInvocationTimelineImporter::CatalogInternal( + iree_hal_buffer_t *user_buffer, bool from_buffer_view) { + SHORTFIN_TRACE_SCOPE_NAMED("CoarseInvocationTimelineImporter::Catalog"); + iree_hal_buffer_t *allocated_buffer = + iree_hal_buffer_allocated_buffer(user_buffer); + allocated_buffer_record &record = allocated_buffers_[allocated_buffer]; + // NOTE: Refcnt snooping is a sordid affair, but we believe is ok in this + // very specific case. This is because we are holding the DAG of live uses + // that we care about and this entire mechanism activates an optimization if + // the number of references in our DAG accounts for *all* outstanding + // references to the backing allocation. If this allocation was off somewhere + // else having its reference count bumped, that could only invalidate the + // optimization vs cause a logic issue. 
+ if (record.primordial_ref_count == 0) { + // Only do the atomic read if not yet initialized. + record.primordial_ref_count = + iree_atomic_ref_count_load(&allocated_buffer->resource.ref_count); + // The reference we are holding. + record.observed_ref_count = 1; + } + if (allocated_buffer != user_buffer) { + // It is a child buffer that contributes to the observed number of + // references. + record.observed_ref_count += from_buffer_view ? 2 : 1; + } else if (from_buffer_view) { + // It is a reference to the root allocation, which is from a buffer view. + record.observed_ref_count += 1; + } +} + +void CoarseInvocationTimelineImporter::FinalizeCatalog() { + for (auto &it : allocated_buffers_) { + iree_hal_buffer_t *allocated_buffer = it.first; + allocated_buffer_record &record = it.second; + + // We can only track for asynchronous dealloc if uniquely owned and + // transferred to us. + if (record.primordial_ref_count != record.observed_ref_count) { + if (options_.assume_no_alias) { + SHORTFIN_SCHED_LOG( + " : Async dealloc override: not unique but assume_no_alias ({} vs " + "{})", + record.primordial_ref_count, record.observed_ref_count); + } else { + SHORTFIN_SCHED_LOG( + " : Synchronous dealloc: Not uniquely owned ({} vs {})", + record.primordial_ref_count, record.observed_ref_count); + continue; + } + } + + // See iree_hal_buffer_placement_t documentation for criteria to satisfy + // tracking for async deallocation. + auto placement = iree_hal_buffer_allocation_placement(allocated_buffer); + iree_hal_device_t *schedule_device = + inv_->device_selection().device()->hal_device(); + auto schedule_affinity = inv_->device_selection().queue_affinity(); + if (!(placement.flags & IREE_HAL_BUFFER_PLACEMENT_FLAG_ASYNCHRONOUS)) { + // Not an async allocation: disable optimization by ignoring this + // entirely. 
+ SHORTFIN_SCHED_LOG( + " : Synchronous dealloc: Buffer not async allocated ({:x})", + placement.flags); + continue; + } + if (placement.device != schedule_device) { + // Placed on a device other than our scheduling timeline. + SHORTFIN_SCHED_LOG( + " : Synchronous dealloc: Schedule vs placement device mismatch ({} " + "vs " + "{})", + static_cast(placement.device), + static_cast(schedule_device)); + continue; + } + if ((placement.queue_affinity & schedule_affinity) != schedule_affinity) { + SHORTFIN_SCHED_LOG( + " : Synchronous dealloc: Schedule affinity not a subset of " + "placement " + "affinity ({:x} vs {:x})", + placement.queue_affinity, schedule_affinity); + continue; + } + + // All checks passed: Set it up for async deallocation. + SHORTFIN_SCHED_LOG( + " : Asynchronous dealloc: Unique, compatible placement"); + ScopedDevice scoped_device(*inv_->fiber(), inv_->device_selection()); + auto dtor = detail::TimelineResource::CreateAsyncBufferDestructor( + scoped_device, + iree::hal_buffer_ptr::borrow_reference(allocated_buffer)); + record.timeline_resource = + inv_->fiber()->NewTimelineResource(std::move(dtor)); + } +} + +detail::TimelineResource::Ref +CoarseInvocationTimelineImporter::ImportTimelineResource( + iree_hal_buffer_t *user_buffer) { + iree_hal_buffer_t *allocated_buffer = + iree_hal_buffer_allocated_buffer(user_buffer); + auto found_it = allocated_buffers_.find(allocated_buffer); + // Fork sync/async to different functions so that the distinction shows up + // vividly in traces. 
+ if (found_it != allocated_buffers_.end() && + found_it->second.timeline_resource) { + SHORTFIN_SCHED_LOG(" : Import async TimelineResource for buffer {}", + static_cast(user_buffer)); + return found_it->second.timeline_resource; + } else { + SHORTFIN_TRACE_SCOPE_NAMED("SyncImportTimelineResource"); + SHORTFIN_SCHED_LOG(" : Import default/sync TimelineResource for buffer {}", + static_cast(user_buffer)); + return inv_->fiber()->NewTimelineResource(); + } +} + // -------------------------------------------------------------------------- // // ProgramIsolate // -------------------------------------------------------------------------- // diff --git a/shortfin/src/shortfin/local/program.h b/shortfin/src/shortfin/local/program.h index f83b83cbd..aa4240107 100644 --- a/shortfin/src/shortfin/local/program.h +++ b/shortfin/src/shortfin/local/program.h @@ -16,6 +16,7 @@ #include "shortfin/local/async.h" #include "shortfin/local/device.h" #include "shortfin/local/program_interfaces.h" +#include "shortfin/local/scheduler.h" #include "shortfin/local/worker.h" #include "shortfin/support/api.h" #include "shortfin/support/iree_helpers.h" @@ -110,14 +111,12 @@ class SHORTFIN_API ProgramInvocation { // thusly are satisfied. void wait_insert(iree_hal_semaphore_list_t sem_list); - // Adds a marshalable argument with a configurable concurrency barrier. - void AddArg(ProgramInvocationMarshalable &marshalable, - ProgramResourceBarrier barrier = ProgramResourceBarrier::READ); - // Adds a ref object argument. This low level interface directly adds a // reference object and does not manipulate any execution barriers. - void AddArg(iree::vm_opaque_ref ref); // Moves a reference in. - void AddArg(iree_vm_ref_t *ref); // Borrows the reference. + void AddArg(iree::vm_opaque_ref ref, + detail::TimelineResource *resource); // Moves a reference in. + void AddArg(iree_vm_ref_t *ref, + detail::TimelineResource *resource); // Borrows the reference. 
 // Transfers ownership of an invocation and schedules it on worker, returning
 // a future that will resolve to the owned invocation upon completion.
@@ -198,6 +197,7 @@ class SHORTFIN_API ProgramInvocation {
   iree::vm_context_ptr vm_context_;
   detail::ProgramIsolate *isolate_;
   iree_vm_list_t *result_list_ = nullptr;
+  detail::TimelineResource **arg_resources_ = nullptr;
   std::optional future_;
   iree::hal_fence_ptr wait_fence_;
   iree_hal_semaphore_t *signal_sem_ = nullptr;
@@ -400,6 +400,59 @@ class SHORTFIN_API StaticProgramParameters : public BaseProgramParameters {
   void LoadMmap(std::filesystem::path file_path, LoadOptions options);
 };
 
+// Handles importing a batch of VM reference types and creating a
+// TimelineResource that can be used to anchor their use and deallocation.
+// This is intended to be a stack allocated instance used as part of a loop for
+// transforming things like a list of function results.
+//
+// The transformation must be performed in two steps, first by visiting every
+// ref and unwinding it to its root iree_hal_buffer_t allocation. Then every
+// direct child allocation of this buffer is counted. If the reference count
+// of the root buffer == the ref count of all children we can observe, then
+// we can claim that the root allocation is uniquely owned by us (the receiver).
+// This allows us to make certain claims/optimizations against its timeline.
+// Otherwise, if the provenance cannot be established, we must treat its
+// lifetime conservatively.
+//
+// Every uniquely owned root buffer will have a new timeline resource created
+// for it with an async destructor, and all of its child buffers will
+// share this resource.
+//
+// Note that this class could be optimized with a small object optimization and
+// avoid accounting allocations, but this is left for a future upgrade.
+class SHORTFIN_API CoarseInvocationTimelineImporter { + public: + struct Options { + // Assumes that the invocation results cannot alias buffers that are + // otherwise owned and can be assumed to be uniquely allocated for the + // caller. It is possible to compile programs that violate this, and this + // makes certain runtime optimizations impossible. We will restrict this + // in the compiler's calling convention at a certain point so that someone + // cannot accidentally do this. + bool assume_no_alias = true; + }; + CoarseInvocationTimelineImporter(ProgramInvocation *inv, Options &options); + + // Access timeline resources for a buffer. + detail::TimelineResource::Ref ImportTimelineResource( + iree_hal_buffer_t *user_buffer); + + private: + void CatalogInternal(iree_hal_buffer_t *user_buffer, bool from_buffer_view); + void FinalizeCatalog(); + struct allocated_buffer_record { + size_t primordial_ref_count = 0; + size_t observed_ref_count = 0; + // For root buffers, the timeline resource. + detail::TimelineResource::Ref timeline_resource; + }; + + ProgramInvocation *inv_; + Options options_; + std::unordered_map + allocated_buffers_; +}; + namespace detail { // See Fiber::program_isolates_. struct ProgramIsolate { diff --git a/shortfin/src/shortfin/local/program_interfaces.h b/shortfin/src/shortfin/local/program_interfaces.h index 8ab46ab24..4d3477483 100644 --- a/shortfin/src/shortfin/local/program_interfaces.h +++ b/shortfin/src/shortfin/local/program_interfaces.h @@ -16,6 +16,7 @@ namespace shortfin::local { +class CoarseInvocationTimelineImporter; class ProgramInvocation; // The type of barrier that should be managed for a program resource. @@ -63,9 +64,12 @@ class SHORTFIN_API ProgramInvocationMarshalableFactory { // iree::vm_opaque_ref)` static method. The type `T` must be friends with this // class. 
template - static T CreateFromInvocationResultRef(ProgramInvocation *inv, - iree::vm_opaque_ref ref) { - return T::CreateFromInvocationResultRef(inv, std::move(ref)); + static T CreateFromInvocationResultRef( + ProgramInvocation *inv, + CoarseInvocationTimelineImporter *timeline_importer, + iree::vm_opaque_ref ref) { + return T::CreateFromInvocationResultRef(inv, timeline_importer, + std::move(ref)); } // Gets the type id that corresponds to this marshalable type. diff --git a/shortfin/src/shortfin/local/scheduler.cc b/shortfin/src/shortfin/local/scheduler.cc index 883951a20..6ca10fc0a 100644 --- a/shortfin/src/shortfin/local/scheduler.cc +++ b/shortfin/src/shortfin/local/scheduler.cc @@ -93,8 +93,9 @@ VoidFuture Account::OnSync() { // -------------------------------------------------------------------------- // TimelineResource::TimelineResource(std::shared_ptr fiber, - size_t semaphore_capacity) - : fiber_(std::move(fiber)) { + size_t semaphore_capacity, + TimelineResourceDestructor destructor) + : fiber_(std::move(fiber)), destructor_(std::move(destructor)) { logging::construct("TimelineResource", this); SHORTFIN_THROW_IF_ERROR( iree_hal_fence_create(semaphore_capacity, fiber_->host_allocator(), @@ -103,6 +104,55 @@ TimelineResource::TimelineResource(std::shared_ptr fiber, TimelineResource::~TimelineResource() { logging::destruct("TimelineResource", this); + if (destructor_) { + destructor_(*this); + } +} + +TimelineResourceDestructor TimelineResource::CreateAsyncBufferDestructor( + ScopedDevice &scoped_device, iree::hal_buffer_ptr buffer) { + // The ScopedDevice doesn't lifetime extend the underlying hal device, so + // we must do that manually across the callback (and then release at the end). 
+ iree_hal_device_retain(scoped_device.raw_device()->hal_device()); + return [device_affinity = scoped_device.affinity(), + buffer = std::move(buffer)](TimelineResource &res) { + ScopedDevice scoped_device(*res.fiber(), device_affinity); + iree_hal_device_t *hal_device = scoped_device.raw_device()->hal_device(); + auto queue_affinity = scoped_device.affinity().queue_affinity(); + SHORTFIN_TRACE_SCOPE_NAMED("TimelineResource::AsyncBufferDestructor"); + + // The dealloca needs to wait on the current timepoint+all uses, and it + // needs to signal the next timepoint. Since this is a destructor, we + // now have exclusive control of the TimelineResource and simply record + // the current timepoint in its uses as a (small) optimization for merging + // the wait semaphore list, prior to acquiring a new timepoint + // (to signal). + auto fiber = res.fiber(); + auto &account = fiber->scheduler().GetDefaultAccount(scoped_device); + iree_hal_semaphore_t *timeline_sem = account.timeline_sem(); + res.use_barrier_insert(timeline_sem, account.timeline_idle_timepoint()); + uint64_t signal_timepoint = account.timeline_acquire_timepoint(); + iree_hal_semaphore_list_t wait_semaphore_list = res.use_barrier(); + iree_hal_semaphore_list_t signal_semaphore_list{ + .count = 1, + .semaphores = &timeline_sem, + .payload_values = &signal_timepoint, + }; + + if (SHORTFIN_SCHED_LOG_ENABLED) { + auto wait_sum = iree::DebugPrintSemaphoreList(wait_semaphore_list); + auto signal_sum = iree::DebugPrintSemaphoreList(signal_semaphore_list); + SHORTFIN_SCHED_LOG( + "async dealloca(device={}, affinity={:x}, buffer={}):[Wait:{}, " + "Signal:{}]", + static_cast(hal_device), queue_affinity, + static_cast(buffer.get()), wait_sum, signal_sum); + } + SHORTFIN_THROW_IF_ERROR(iree_hal_device_queue_dealloca( + hal_device, queue_affinity, wait_semaphore_list, signal_semaphore_list, + buffer)); + iree_hal_device_release(hal_device); + }; } void TimelineResource::use_barrier_insert(iree_hal_semaphore_t *sem, diff 
--git a/shortfin/src/shortfin/local/scheduler.h b/shortfin/src/shortfin/local/scheduler.h index cd493a41b..17881955b 100644 --- a/shortfin/src/shortfin/local/scheduler.h +++ b/shortfin/src/shortfin/local/scheduler.h @@ -24,6 +24,7 @@ namespace detail { class Account; class Scheduler; +class TimelineResource; // Transactions are accumulated into a command buffer by type and in // auto-flush mode, the command buffer is submitted upon a change of type. @@ -53,6 +54,10 @@ enum class TransactionMode { EXPLICIT = 1, }; +// Destructor callback to be invoked just before the timeline resource is +// destroyed. +using TimelineResourceDestructor = std::function; + // Control object for a resource that is tracked on the timeline. Each such // resource is associated with a single Account. In the case of a resource // that is shared across device queues, a consistent account will be chosen @@ -81,8 +86,13 @@ class SHORTFIN_API TimelineResource { class SHORTFIN_API Ref { public: Ref() : res_(nullptr) {} - explicit Ref(TimelineResource *res) : res_(res) { res_->Retain(); } - Ref(const Ref &other) : res_(other.res_) { res_->Retain(); } + explicit Ref(TimelineResource *res) : res_(res) { + if (res_) res_->Retain(); + } + Ref(const Ref &other) : res_(other.res_) { + if (res_) res_->Retain(); + } + operator bool() const { return res_ != nullptr; } Ref &operator=(const Ref &other) { if (other.res_ != res_) { reset(); @@ -104,6 +114,7 @@ class SHORTFIN_API TimelineResource { Ref(Ref &&other) : res_(other.res_) { other.res_ = nullptr; } ~Ref() { reset(); } TimelineResource *operator->() { return res_; } + TimelineResource *get() { return res_; } void reset() { if (res_) { @@ -117,7 +128,14 @@ class SHORTFIN_API TimelineResource { }; TimelineResource(TimelineResource &other) = delete; - // Sets the mutation barrier. + // Creates an asynchronous buffer destructor for this resource. When the + // resource is about to be destroyed, an async dealloca will be issued at + // the use barrier. 
+ static TimelineResourceDestructor CreateAsyncBufferDestructor( + ScopedDevice &scoped_device, iree::hal_buffer_ptr buffer); + + // Sets the mutation barrier. The mutation barrier is the point on a timeline + // beyond which there are no further writes. // Note that the semaphore set in this way is not retained as it is // assumed to be part of the local scheduler. void set_mutation_barrier(iree_hal_semaphore_t *sem, uint64_t timepoint) { @@ -135,6 +153,8 @@ class SHORTFIN_API TimelineResource { } } + // Barrier beyond which there are no further uses of the resource, including + // both reads and writes. // Use barrier can have new timepoints inserted or converted to a // semaphore list. void use_barrier_insert(iree_hal_semaphore_t *sem, uint64_t timepoint); @@ -144,14 +164,17 @@ class SHORTFIN_API TimelineResource { iree_allocator_t host_allocator(); - private: - TimelineResource(std::shared_ptr fiber, size_t semaphore_capacity); - ~TimelineResource(); void Retain() { refcnt_++; } void Release() { if (--refcnt_ == 0) delete this; } + Fiber *fiber() { return fiber_.get(); } + + private: + TimelineResource(std::shared_ptr fiber, size_t semaphore_capacity, + TimelineResourceDestructor destructor); + ~TimelineResource(); int refcnt_ = 0; // Back reference to the owning fiber. @@ -166,6 +189,9 @@ class SHORTFIN_API TimelineResource { // Use barrier fence. The fact that this is a fence object with a fixed // capacity is an implementation detail. iree::hal_fence_ptr use_barrier_fence_; + + // Destructor to be called just prior to the TimelineResource being destroyed. + TimelineResourceDestructor destructor_; friend class Scheduler; }; @@ -261,9 +287,11 @@ class SHORTFIN_API Scheduler { // Gets a fresh TimelineResource which can be used for tracking resource // read/write and setting barriers. Note that these are all allocated fresh // on each call today but may be pooled in the future. 
- TimelineResource::Ref NewTimelineResource(std::shared_ptr fiber) { - return TimelineResource::Ref( - new TimelineResource(std::move(fiber), semaphore_count_)); + TimelineResource::Ref NewTimelineResource( + std::shared_ptr fiber, + TimelineResourceDestructor destructor = nullptr) { + return TimelineResource::Ref(new TimelineResource( + std::move(fiber), semaphore_count_, std::move(destructor))); } // Creates a new fence with capacity for all semaphores that are extant at diff --git a/shortfin/src/shortfin/support/iree_helpers.cc b/shortfin/src/shortfin/support/iree_helpers.cc index 17430bb71..81406210e 100644 --- a/shortfin/src/shortfin/support/iree_helpers.cc +++ b/shortfin/src/shortfin/support/iree_helpers.cc @@ -7,6 +7,7 @@ #include "shortfin/support/iree_helpers.h" #include +#include #include #include @@ -106,4 +107,13 @@ void error::AppendStatusMessage() { } } +std::string DebugPrintSemaphoreList(iree_hal_semaphore_list_t &sl) { + std::vector parts; + for (unsigned i = 0; i < sl.count; ++i) { + parts.push_back(fmt::format("{}@{}", static_cast(sl.semaphores[i]), + sl.payload_values[i])); + } + return fmt::format("({})", fmt::join(parts, ", ")); +} + } // namespace shortfin::iree diff --git a/shortfin/src/shortfin/support/iree_helpers.h b/shortfin/src/shortfin/support/iree_helpers.h index f8d3f1398..c36bf8b6a 100644 --- a/shortfin/src/shortfin/support/iree_helpers.h +++ b/shortfin/src/shortfin/support/iree_helpers.h @@ -354,6 +354,12 @@ using vm_opaque_ref = ::iree::vm::opaque_ref; template using vm_ref = ::iree::vm::ref; +// -------------------------------------------------------------------------- // +// Debugging +// -------------------------------------------------------------------------- // + +std::string DebugPrintSemaphoreList(iree_hal_semaphore_list_t &sl); + } // namespace iree } // namespace shortfin diff --git a/shortfin/tests/invocation/mobilenet_program_test.py b/shortfin/tests/invocation/mobilenet_program_test.py index 502843bc3..686a0a1fc 
100644 --- a/shortfin/tests/invocation/mobilenet_program_test.py +++ b/shortfin/tests/invocation/mobilenet_program_test.py @@ -89,6 +89,7 @@ async def main(): device_input = get_mobilenet_ref_input(device) (device_output,) = await mobilenet_program_function(device_input, fiber=fiber0) await assert_mobilenet_ref_output(device, device_output) + del device_output lsys.run(main()) @@ -106,6 +107,7 @@ async def main(): device_input, fiber=fiber0 ) await assert_mobilenet_ref_output(device, device_output) + del device_output lsys.run(main()) @@ -123,12 +125,14 @@ async def main(): for _ in range(5) ] - await asyncio.gather( + gather = asyncio.gather( *[ assert_mobilenet_ref_output(device, device_output) for (device_output,) in results ] ) + del results + await gather lsys.run(main()) @@ -157,12 +161,14 @@ async def main(): ] ) - await asyncio.gather( + gather = asyncio.gather( *[ assert_mobilenet_ref_output(device, device_output) for (device_output,) in results ] ) + del results + await gather lsys.run(main()) @@ -189,12 +195,14 @@ async def main(): ] ) - await asyncio.gather( + gather = asyncio.gather( *[ assert_mobilenet_ref_output(device, device_output) for (device_output,) in results ] ) + del results + await gather lsys.run(main()) @@ -221,6 +229,7 @@ def duration(): ) print(f"{self}: Program complete (+{duration()}ms)") await assert_mobilenet_ref_output(device, device_output) + del device_output print(f"{self} End (+{duration()}ms)") async def main():