diff --git a/libhrx/src/binding/common/event.c b/libhrx/src/binding/common/event.c index 6c6553d08..74e9b76e9 100644 --- a/libhrx/src/binding/common/event.c +++ b/libhrx/src/binding/common/event.c @@ -160,10 +160,11 @@ iree_status_t iree_hal_streaming_event_record( IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_hal_streaming_stream_flush(stream)); + iree_slim_mutex_lock(&stream->mutex); + // Use stream's current pending value as wait value and increment for signal. uint64_t wait_value = stream->pending_value; event->signal_value = wait_value + 1; - stream->pending_value = event->signal_value; // Create a queue barrier to signal the event semaphore. // This waits for the stream's last submission to complete before signaling. @@ -181,10 +182,19 @@ iree_status_t iree_hal_streaming_event_record( .payload_values = signal_values, }; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_device_queue_barrier( - stream->context->device, stream->queue_affinity, wait_semaphores, - signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE)); + iree_status_t status = iree_hal_device_queue_barrier( + stream->context->device, stream->queue_affinity, wait_semaphores, + signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE); + if (iree_status_is_ok(status)) { + status = iree_hal_device_queue_flush(stream->context->device, + stream->queue_affinity); + } + if (iree_status_is_ok(status)) { + stream->pending_value = event->signal_value; + stream->submitted_value = event->signal_value; + } + iree_slim_mutex_unlock(&stream->mutex); + IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, status); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); diff --git a/libhrx/src/binding/common/graph.c b/libhrx/src/binding/common/graph.c index c924f733e..7a9c50721 100644 --- a/libhrx/src/binding/common/graph.c +++ b/libhrx/src/binding/common/graph.c @@ -1029,6 +1029,10 @@ static iree_status_t iree_hal_streaming_pack_raw_argument_list( iree_host_size_t* out_constants_size) { IREE_ASSERT_ARGUMENT(parameters); IREE_ASSERT_ARGUMENT(out_constants_size); + if (iree_hal_streaming_parameter_info_is_empty(parameters)) { + *out_constants_size = 0; + return iree_ok_status(); + } *out_constants_size = parameters->direct_arg_bytes ? parameters->direct_arg_bytes : parameters->constant_bytes; @@ -1038,7 +1042,9 @@ static iree_status_t iree_hal_streaming_pack_raw_argument_list( if (*out_constants_size == 0) { return iree_ok_status(); } - if (!parameter_list || !out_constants) { + if (!out_constants || (!parameter_list && (parameters->buffer_size > 0 || + parameters->binding_count > 0 || + parameters->copy_count > 0))) { return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "raw kernel arguments require parameter storage"); } @@ -1109,11 +1115,14 @@ iree_status_t iree_hal_streaming_graph_add_kernel_node( (params->flags & IREE_HAL_STREAMING_DISPATCH_FLAG_ARGS_ARRAY) != 0; const bool is_native_kernel = symbol->parameters.binding_count == 0 && symbol->parameters.copy_count == 0; - if (is_args_array && is_native_kernel && params->buffer) { + const bool is_empty_native_kernel = + is_native_kernel && + iree_hal_streaming_parameter_info_is_empty(&symbol->parameters); + if (is_args_array && is_native_kernel && !is_empty_native_kernel) { IREE_TRACE_ZONE_END(z0); return iree_make_status( IREE_STATUS_UNIMPLEMENTED, - "args-array graph kernel launch requires parameter metadata"); + "non-empty args-array graph kernel launch requires parameter metadata"); } iree_host_size_t constants_capacity = symbol->parameters.constant_bytes; @@ -1180,7 +1189,9 @@ iree_status_t iree_hal_streaming_graph_add_kernel_node( attrs->constants_capacity = constants_capacity; attrs->bindings.count = symbol->parameters.binding_count; attrs->bindings.values = - (iree_hal_buffer_ref_t*)(extra_data + constants_size); + symbol->parameters.binding_count + ? (iree_hal_buffer_ref_t*)(extra_data + constants_size) + : NULL; attrs->binding_capacity = symbol->parameters.binding_count; iree_status_t unpack_status = iree_ok_status(); if (is_pre_packed && params->buffer) { @@ -1195,6 +1206,10 @@ iree_status_t iree_hal_streaming_graph_add_kernel_node( } attrs->constants = iree_make_const_byte_span(constants, captured_size); attrs->bindings = iree_hal_buffer_ref_list_empty(); + } else if (is_args_array && is_empty_native_kernel) { + // HIP host stubs may pass a {NULL} args array for no-argument kernels. + attrs->constants = iree_make_const_byte_span(constants, 0); + attrs->bindings = iree_hal_buffer_ref_list_empty(); } else if (is_args_array) { unpack_status = iree_hal_streaming_unpack_parameter_list( graph->context, &symbol->parameters, (void**)params->buffer, constants, @@ -1273,10 +1288,13 @@ iree_status_t iree_hal_streaming_graph_set_kernel_node_params( (params->flags & IREE_HAL_STREAMING_DISPATCH_FLAG_ARGS_ARRAY) != 0; const bool is_native_kernel = symbol->parameters.binding_count == 0 && symbol->parameters.copy_count == 0; - if (is_args_array && is_native_kernel && params->buffer) { + const bool is_empty_native_kernel = + is_native_kernel && + iree_hal_streaming_parameter_info_is_empty(&symbol->parameters); + if (is_args_array && is_native_kernel && !is_empty_native_kernel) { return iree_make_status( IREE_STATUS_UNIMPLEMENTED, - "args-array graph kernel launch requires parameter metadata"); + "non-empty args-array graph kernel launch requires parameter metadata"); } iree_host_size_t constants_capacity = symbol->parameters.constant_bytes; @@ -1316,6 +1334,10 @@ iree_status_t iree_hal_streaming_graph_set_kernel_node_params( } constants_span = iree_make_const_byte_span(constants, captured_size); bindings = iree_hal_buffer_ref_list_empty(); + } else if (is_args_array && is_empty_native_kernel) { + // HIP host stubs may pass a {NULL} args array for no-argument kernels. + constants_span = iree_make_const_byte_span(constants, 0); + bindings = iree_hal_buffer_ref_list_empty(); } else if (is_args_array) { unpack_status = iree_hal_streaming_unpack_parameter_list( node->graph->context, &symbol->parameters, (void**)params->buffer, diff --git a/libhrx/src/binding/common/internal.h b/libhrx/src/binding/common/internal.h index 79d3a0ba8..8cbbb42a5 100644 --- a/libhrx/src/binding/common/internal.h +++ b/libhrx/src/binding/common/internal.h @@ -603,6 +603,15 @@ typedef struct iree_hal_streaming_parameter_info_t { iree_hal_streaming_parameter_op_t* ops; } iree_hal_streaming_parameter_info_t; +// True when launch metadata describes no parameters in either HAL binding form +// or native direct-argument form. +static inline bool iree_hal_streaming_parameter_info_is_empty( + const iree_hal_streaming_parameter_info_t* parameters) { + return parameters->buffer_size == 0 && parameters->constant_bytes == 0 && + parameters->direct_arg_bytes == 0 && parameters->binding_count == 0 && + parameters->copy_count == 0; +} + // Symbol metadata structure. typedef struct iree_hal_streaming_symbol_t { // Parent module. Unowned. diff --git a/libhrx/src/binding/common/memory.c b/libhrx/src/binding/common/memory.c index bfa33af14..5ccac46bb 100644 --- a/libhrx/src/binding/common/memory.c +++ b/libhrx/src/binding/common/memory.c @@ -907,27 +907,14 @@ static iree_status_t iree_hal_streaming_memory_allocate_host_with_context_mode( .min_alignment = host_alignment, }; - void* host_ptr = NULL; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_allocator_malloc_aligned(context->host_allocator, - allocation_size, host_alignment, - /*offset=*/0, &host_ptr)); - iree_hal_buffer_t* buffer = NULL; - iree_hal_external_buffer_t external_buffer = { - .type = IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION, - .flags = IREE_HAL_EXTERNAL_BUFFER_FLAG_NONE, - .size = (iree_device_size_t)allocation_size, - .handle.host_allocation.ptr = host_ptr, - }; - iree_status_t status = iree_hal_allocator_import_buffer( - context->device_allocator, params, &external_buffer, - iree_hal_buffer_release_callback_null(), &buffer); + iree_status_t status = iree_hal_allocator_allocate_buffer( + context->device_allocator, params, allocation_size, &buffer); iree_hal_streaming_buffer_t* wrapper = NULL; if (iree_status_is_ok(status)) { status = iree_hal_streaming_buffer_wrap( - context, buffer, (int)memory_type, host_ptr, + context, buffer, (int)memory_type, /*imported_host_ptr=*/NULL, /*allocation_pool=*/NULL, context_ownership, &wrapper); } iree_hal_buffer_release(buffer); @@ -939,17 +926,14 @@ static iree_status_t iree_hal_streaming_memory_allocate_host_with_context_mode( } if (iree_status_is_ok(status)) { - wrapper->owns_host_ptr = true; wrapper->imported_host_allocation = false; wrapper->host_register_flags = flags; *out_buffer = wrapper; - host_ptr = NULL; } else { if (wrapper) { hrx_buffer_table_remove(&context->buffer_table, wrapper->device_ptr); iree_hal_streaming_buffer_free(wrapper); } - iree_allocator_free_aligned(context->host_allocator, host_ptr); } IREE_TRACE_ZONE_END(z0); return status; diff --git a/libhrx/src/binding/common/module.c b/libhrx/src/binding/common/module.c index c4e2f9e40..41b9b69a9 100644 --- a/libhrx/src/binding/common/module.c +++ b/libhrx/src/binding/common/module.c @@ -255,6 +255,7 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( for (iree_host_size_t i = 0, parameter_base = 0; iree_status_is_ok(status) && i < module->symbol_count; ++i) { iree_hal_streaming_symbol_t* symbol = &module->symbols[i]; + memset(symbol, 0, sizeof(*symbol)); symbol->module = module; symbol->name = export_infos[i].name; symbol->type = IREE_HAL_STREAMING_SYMBOL_TYPE_FUNCTION; @@ -272,6 +273,12 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( // Initialize parameter info. iree_hal_streaming_parameter_info_t* parameter_info = &symbol->parameters; + if (export_infos[i].constant_byte_length > UINT16_MAX) { + status = iree_make_status( + IREE_STATUS_OUT_OF_RANGE, + "function constant metadata exceeds supported parameter size"); + continue; + } // Executable binding_count describes normal HAL dispatch bindings. HRX's // unpacker needs the number of reflected BINDING parameters it will // resolve from the HIP launch ABI. @@ -289,6 +296,7 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( // Build operations with coalescing. // Copy ops go first, then resolve ops. uint16_t src_offset = 0; + size_t direct_arg_offset = 0; uint16_t buffer_size = 0; size_t this_kernel_direct_arg_size = 0; // Native direct-arg prefix size. iree_hal_streaming_parameter_op_t* copy_ops_start = current_ops; @@ -296,7 +304,8 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( current_ops + symbol_op_counts[i].copy_count; uint16_t copy_count = 0; uint16_t resolve_count = 0; - for (uint16_t j = 0; j < parameter_count; ++j) { + for (uint16_t j = 0; iree_status_is_ok(status) && j < parameter_count; + ++j) { const iree_hal_executable_export_parameter_t* parameter = ¶meters[parameter_base + j]; const bool is_binding_parameter = @@ -305,6 +314,19 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( parameter->type == IREE_HAL_EXECUTABLE_EXPORT_PARAMETER_TYPE_BUFFER_PTR && resolve_count < export_infos[i].binding_count; + size_t native_dst_offset = direct_arg_offset; + if (is_buffer_binding_parameter) { + native_dst_offset = parameter->offset; + } + const size_t source_extent = (size_t)src_offset + parameter->size; + const size_t native_extent = native_dst_offset + parameter->size; + if (source_extent > UINT16_MAX || native_dst_offset > UINT16_MAX || + native_extent > UINT16_MAX) { + status = iree_make_status( + IREE_STATUS_OUT_OF_RANGE, + "function parameter metadata exceeds supported argument size"); + break; + } if (is_binding_parameter || is_buffer_binding_parameter) { // Update offsets. Bindings are passed as pointers. // |parameter->offset| is the kernarg byte offset for all parameter @@ -314,25 +336,27 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( // exactly the index of this parameter in the bindings list. iree_hal_streaming_parameter_resolve_op_t* op = &resolve_ops_start[resolve_count].resolve; + op->reserved = 0; op->src_offset = src_offset; op->dst_ordinal = resolve_count; op->src_ordinal = j; - // For HIP/CUDA native launches using CUSTOM_DIRECT_ARGUMENTS we need + // For HIP native launches using CUSTOM_DIRECT_ARGUMENTS we need // to place raw device pointers at their kernarg ABI offset. Binding // export parameter offsets are binding-list ordinals in some IREE HAL // backends, not byte offsets, so use the packed source offset we // calculate from the full parameter sequence. AMDGPU BUFFER_PTR // parameters already carry native kernarg byte offsets. - op->dst_offset = - is_buffer_binding_parameter ? parameter->offset : src_offset; - src_offset += parameter->size; + op->dst_offset = (uint16_t)native_dst_offset; + src_offset = (uint16_t)source_extent; buffer_size = src_offset; ++resolve_count; - size_t param_extent = (size_t)op->dst_offset + parameter->size; + size_t param_extent = native_extent; if (param_extent > this_kernel_direct_arg_size) { this_kernel_direct_arg_size = param_extent; } + direct_arg_offset = + iree_max(param_extent, direct_arg_offset + parameter->size); } else { // TODO: fix coalescing. It does not work when we have // parameter arrays because each constant comes in as a @@ -349,27 +373,31 @@ static iree_status_t iree_hal_streaming_module_extract_metadata( op->size = parameter->size; op->src_offset = src_offset; op->src_ordinal = j; - op->direct_dst_offset = src_offset; + op->direct_dst_offset = (uint16_t)native_dst_offset; op->dst_offset = parameter->offset; // offset in constants ++copy_count; // active_copy = op; // } - src_offset += parameter->size; + src_offset = (uint16_t)source_extent; buffer_size = src_offset; - size_t direct_arg_extent = - (size_t)op->direct_dst_offset + parameter->size; + size_t direct_arg_extent = native_extent; if (direct_arg_extent > this_kernel_direct_arg_size) { this_kernel_direct_arg_size = direct_arg_extent; } + direct_arg_offset = + iree_max(direct_arg_extent, direct_arg_offset + parameter->size); } } - parameter_info->buffer_size = buffer_size; - parameter_info->constant_bytes = export_infos[i].constant_byte_length; - if (buffer_size > this_kernel_direct_arg_size) { - this_kernel_direct_arg_size = buffer_size; + if (iree_status_is_ok(status)) { + parameter_info->buffer_size = buffer_size; + parameter_info->constant_bytes = + (uint16_t)export_infos[i].constant_byte_length; + if (buffer_size > this_kernel_direct_arg_size) { + this_kernel_direct_arg_size = buffer_size; + } + parameter_info->direct_arg_bytes = (uint16_t)this_kernel_direct_arg_size; } - parameter_info->direct_arg_bytes = this_kernel_direct_arg_size; // Advance to next symbol's ops. parameter_base += parameter_count; @@ -412,11 +440,11 @@ iree_status_t iree_hal_streaming_module_create_from_memory( module->cache = context->executable_cache; iree_hal_executable_cache_retain(module->cache); - // HIP / CUDA hand us anything the toolchain emits — raw AMDGPU ELFs, + // HIP toolchains hand us several container formats: raw AMDGPU ELFs, // __CLANG_OFFLOAD_BUNDLE__ archives, CCOB (zstd-compressed bundles), and - // __hipFatBinaryWrapper-wrapped combinations of all of the above. Unwrap - // everything here and only forward raw ELF plus an explicit executable - // format to the HAL executable cache. + // __hipFatBinaryWrapper-wrapped combinations of those. Unwrap everything here + // and only forward raw ELF plus an explicit executable format to the HAL + // executable cache. iree_const_byte_span_t executable_data = image; const char* executable_format = NULL; const bool try_fat_unwrap = context->device_entry != NULL && diff --git a/libhrx/src/binding/common/stream.c b/libhrx/src/binding/common/stream.c index 3c1292553..4546fa244 100644 --- a/libhrx/src/binding/common/stream.c +++ b/libhrx/src/binding/common/stream.c @@ -637,12 +637,12 @@ iree_status_t iree_hal_streaming_stream_wait_event( IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, iree_hal_streaming_stream_flush(stream)); - // Get the current stream pending value to signal after waiting for the event. - uint64_t signal_value = stream->pending_value + 1; - stream->pending_value = signal_value; + iree_slim_mutex_lock(&stream->mutex); - // Create a queue barrier that waits for the event and signals the stream. - // This ensures the stream continues only after the event is signaled. + // Reserve the next stream timeline value and submit a barrier that completes + // only after the event is signaled. The value is submitted, not completed; + // query/synchronize advance completed_value after observing the semaphore. + uint64_t signal_value = stream->pending_value + 1; iree_hal_semaphore_list_t wait_semaphores = { .count = 1, .semaphores = &event->semaphore, @@ -654,13 +654,20 @@ iree_status_t iree_hal_streaming_stream_wait_event( .payload_values = &signal_value, }; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, iree_hal_device_queue_barrier( - stream->context->device, stream->queue_affinity, wait_semaphores, - signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE)); + iree_status_t status = iree_hal_device_queue_barrier( + stream->context->device, stream->queue_affinity, wait_semaphores, + signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE); + if (iree_status_is_ok(status)) { + status = iree_hal_device_queue_flush(stream->context->device, + stream->queue_affinity); + } + if (iree_status_is_ok(status)) { + stream->pending_value = signal_value; + stream->submitted_value = signal_value; + } - // Update completed value to track this barrier. - stream->completed_value = signal_value; + iree_slim_mutex_unlock(&stream->mutex); + IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, status); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -828,11 +835,32 @@ iree_status_t iree_hal_streaming_unpack_parameters( iree_hal_buffer_ref_list_t* out_bindings) { IREE_ASSERT_ARGUMENT(context); IREE_ASSERT_ARGUMENT(parameters); - if (parameters->buffer_size == 0) { + if (iree_hal_streaming_parameter_info_is_empty(parameters)) { return iree_ok_status(); } - IREE_ASSERT_ARGUMENT(parameter_buffer_ptr); + const bool requires_parameter_storage = parameters->buffer_size > 0 || + parameters->binding_count > 0 || + parameters->copy_count > 0; + if (!requires_parameter_storage) { + return iree_ok_status(); + } + if (!parameter_buffer_ptr) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel parameter buffer is required"); + } + if (parameters->copy_count > 0 && !out_constants) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel constant storage is required"); + } IREE_ASSERT_ARGUMENT(out_bindings); + if (!out_bindings) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel binding list is required"); + } + if (parameters->binding_count > 0 && !out_bindings->values) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel binding storage is required"); + } const uint8_t* parameter_buffer = (const uint8_t*)parameter_buffer_ptr; @@ -844,8 +872,10 @@ iree_status_t iree_hal_streaming_unpack_parameters( const iree_hal_streaming_parameter_op_t* op = ¶meters->ops[0]; for (uint32_t i = 0; i < parameters->copy_count; ++i, ++op) { const iree_hal_streaming_parameter_copy_op_t copy_op = op->copy; - memcpy(constants + copy_op.dst_offset, - parameter_buffer + copy_op.src_offset, copy_op.size); + if (copy_op.size > 0) { + memcpy(constants + copy_op.dst_offset, + parameter_buffer + copy_op.src_offset, copy_op.size); + } } // Resolve bindings, if any. @@ -885,11 +915,32 @@ iree_status_t iree_hal_streaming_unpack_parameter_list( iree_hal_buffer_ref_list_t* out_bindings) { IREE_ASSERT_ARGUMENT(context); IREE_ASSERT_ARGUMENT(parameters); - if (parameters->buffer_size == 0) { + if (iree_hal_streaming_parameter_info_is_empty(parameters)) { return iree_ok_status(); } - IREE_ASSERT_ARGUMENT(parameter_list); + const bool requires_parameter_storage = parameters->buffer_size > 0 || + parameters->binding_count > 0 || + parameters->copy_count > 0; + if (!requires_parameter_storage) { + return iree_ok_status(); + } + if (!parameter_list) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel parameter list is required"); + } + if (parameters->copy_count > 0 && !out_constants) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel constant storage is required"); + } IREE_ASSERT_ARGUMENT(out_bindings); + if (!out_bindings) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel binding list is required"); + } + if (parameters->binding_count > 0 && !out_bindings->values) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel binding storage is required"); + } // When parameters are provided as an array of pointers, each element in the // array points to the actual parameter value. Metadata-described pointer @@ -906,7 +957,14 @@ iree_status_t iree_hal_streaming_unpack_parameter_list( // array. Each parameter_list[index] is a pointer to the actual value. // We need to dereference it to get the value. void* param_ptr = parameter_list[copy_op.src_ordinal]; - memcpy(constants + copy_op.dst_offset, param_ptr, copy_op.size); + if (!param_ptr && copy_op.size > 0) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel argument %" PRIu32 " is NULL", + (uint32_t)copy_op.src_ordinal); + } + if (copy_op.size > 0) { + memcpy(constants + copy_op.dst_offset, param_ptr, copy_op.size); + } } // Resolve bindings, if any. @@ -920,6 +978,11 @@ iree_status_t iree_hal_streaming_unpack_parameter_list( const iree_hal_streaming_parameter_resolve_op_t resolve_op = op->resolve; // In pointer array mode, src_offset is an index into the parameter_list. void* param_ptr = parameter_list[resolve_op.src_ordinal]; + if (!param_ptr) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "kernel argument %" PRIu32 " is NULL", + (uint32_t)resolve_op.src_ordinal); + } // The parameter points to a device pointer (void*) void* device_ptr = *(void**)param_ptr; // Kernel metadata identifies pointer slots but not the dynamic object @@ -950,11 +1013,18 @@ static iree_status_t iree_hal_streaming_pack_raw_argument_list( IREE_ASSERT_ARGUMENT(parameters); IREE_ASSERT_ARGUMENT(out_constants_size); + if (iree_hal_streaming_parameter_info_is_empty(parameters)) { + *out_constants_size = 0; + return iree_ok_status(); + } + *out_constants_size = parameters->direct_arg_bytes ? parameters->direct_arg_bytes : parameters->constant_bytes; if (*out_constants_size == 0) return iree_ok_status(); - if (!parameter_list || !out_constants) { + if (!out_constants || (!parameter_list && (parameters->buffer_size > 0 || + parameters->binding_count > 0 || + parameters->copy_count > 0))) { return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "raw kernel arguments require parameter storage"); } @@ -1087,6 +1157,9 @@ iree_status_t iree_hal_streaming_launch_kernel( // Native kernels have no bindings and no copy operations. bool is_native_kernel = (symbol->parameters.binding_count == 0 && symbol->parameters.copy_count == 0); + const bool is_empty_native_kernel = + is_native_kernel && + iree_hal_streaming_parameter_info_is_empty(&symbol->parameters); size_t constants_size = symbol->parameters.constant_bytes; // Track if we need to use raw argument passing (e.g., for external pointers). @@ -1120,11 +1193,11 @@ iree_status_t iree_hal_streaming_launch_kernel( binding_list.count = 0; // No IREE bindings, using raw pointers. use_raw_arguments = true; } else if (params->flags & IREE_HAL_STREAMING_DISPATCH_FLAG_ARGS_ARRAY) { - if (is_native_kernel && params->buffer) { + if (is_native_kernel && params->buffer && !is_empty_native_kernel) { IREE_TRACE_ZONE_END(z0); return iree_make_status( IREE_STATUS_UNIMPLEMENTED, - "args-array kernel launch requires parameter metadata"); + "non-empty args-array kernel launch requires parameter metadata"); } // Unpack parameters from array of pointers (void**). iree_status_t unpack_status = iree_hal_streaming_unpack_parameter_list( @@ -1305,11 +1378,9 @@ iree_status_t iree_hal_streaming_launch_kernel( // important: under the AMDGPU HAL backend it resolves to an AGENT-scoped // AQL release+acquire fence between this dispatch and the next, which // flushes the GPU L1/L2 caches so the next dispatch sees this dispatch's - // writes. A bare execution barrier with no memory_barriers (count=0) - // resolves to NONE/NONE scopes after upstream IREE commit 48af1651a1 - // ("Preserve command-buffer barrier scopes") and lets later dispatches - // launch with stale cache state, producing garbage output (e.g. NaN - // logits in GPT-2 forward). + // writes. A bare execution barrier with no memory barriers does not publish + // dispatch memory side effects under backends that preserve empty barrier + // scopes, so later dispatches can observe stale device cache contents. if (iree_status_is_ok(status) && !hrx_disable_dispatch_barrier_enabled()) { static const iree_hal_memory_barrier_t memory_barrier = { .source_scope = IREE_HAL_ACCESS_SCOPE_DISPATCH_READ | diff --git a/libhrx/src/binding/hip/api.c b/libhrx/src/binding/hip/api.c index 652dbb093..f789b3e2f 100644 --- a/libhrx/src/binding/hip/api.c +++ b/libhrx/src/binding/hip/api.c @@ -4225,7 +4225,7 @@ static void iree_hip_managed_fill_accessed_by(uint64_t mask, int* devices, // - Memory persists until freed with hipFree(). // - Memory is accessible from all streams on the device. // - Allocation is aligned to meet all alignment requirements. -// - Zero-size allocations are allowed and return a valid pointer. +// - Zero-size allocations are allowed and return NULL. // // Multi-GPU: Memory is allocated on the current device. // @@ -4258,6 +4258,12 @@ HIPAPI hipError_t hipMalloc(void** ptr, size_t size) { HIP_RETURN_ERROR(hipErrorStreamCaptureUnsupported); } + if (size == 0) { + *ptr = NULL; + IREE_TRACE_ZONE_END(z0); + return hipSuccess; + } + hrx_mem_pool_t pool = NULL; hipError_t pool_result = iree_hip_current_mem_pool(context, &pool); if (pool_result != hipSuccess) { @@ -5297,16 +5303,32 @@ static hipError_t iree_hip_resolve_memcpy_kind( return hipSuccess; } +static hipError_t iree_hip_validate_hip_visible_memcpy_range( + iree_hal_streaming_buffer_ref_t range_ref, size_t count) { + if (!range_ref.buffer || count > (size_t)IREE_DEVICE_SIZE_MAX) { + return hipErrorInvalidValue; + } + const iree_device_size_t device_count = (iree_device_size_t)count; + const iree_device_size_t logical_size = range_ref.buffer->logical_size; + if (range_ref.offset > logical_size || + device_count > logical_size - range_ref.offset) { + return hipErrorInvalidValue; + } + return hipSuccess; +} + static hipError_t iree_hip_validate_known_memcpy_range( iree_hal_streaming_context_t* context, const void* ptr, size_t count, bool use_hip_visible_pool_size) { - (void)use_hip_visible_pool_size; if (count == 0) return hipSuccess; iree_hal_streaming_buffer_ref_t range_ref; iree_status_t status = iree_hal_streaming_memory_lookup_range( context, (iree_hal_streaming_deviceptr_t)ptr, count, &range_ref); if (iree_status_is_ok(status)) { + if (use_hip_visible_pool_size) { + return iree_hip_validate_hip_visible_memcpy_range(range_ref, count); + } return hipSuccess; } const iree_status_code_t code = iree_status_code(status); @@ -5319,8 +5341,12 @@ static hipError_t iree_hip_validate_known_memcpy_range( status = iree_hal_streaming_memory_lookup_range_across_contexts( (iree_hal_streaming_deviceptr_t)ptr, count, &owner_context, &range_ref); if (iree_status_is_ok(status)) { + hipError_t result = hipSuccess; + if (use_hip_visible_pool_size) { + result = iree_hip_validate_hip_visible_memcpy_range(range_ref, count); + } iree_hal_streaming_context_release(owner_context); - return hipSuccess; + return result; } if (iree_status_code(status) != IREE_STATUS_NOT_FOUND) { return iree_status_to_hip_result(status); @@ -5333,6 +5359,20 @@ static hipError_t iree_hip_validate_known_memcpy_range( if (iree_status_is_ok(status)) { return hipErrorInvalidValue; } + if (iree_status_code(status) != IREE_STATUS_NOT_FOUND) { + return iree_status_to_hip_result(status); + } + iree_status_ignore(status); + + status = iree_hal_streaming_memory_lookup_range_across_contexts( + (iree_hal_streaming_deviceptr_t)ptr, 1, &owner_context, &buffer_ref); + if (iree_status_is_ok(status)) { + iree_hal_streaming_context_release(owner_context); + return hipErrorInvalidValue; + } + if (iree_status_code(status) != IREE_STATUS_NOT_FOUND) { + return iree_status_to_hip_result(status); + } iree_status_ignore(status); return hipSuccess; } @@ -5669,8 +5709,8 @@ HIPAPI hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, IREE_TRACE_ZONE_END(z0); HIP_RETURN_ERROR(kind_result); } - hipError_t range_result = iree_hip_validate_memcpy_ranges( - context, dst, src, sizeBytes, kind, false); + hipError_t range_result = + iree_hip_validate_memcpy_ranges(context, dst, src, sizeBytes, kind, true); if (range_result != hipSuccess) { IREE_TRACE_ZONE_END(z0); HIP_RETURN_ERROR(range_result); @@ -5792,13 +5832,10 @@ HIPAPI hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, iree_hal_streaming_stream_t* stream_obj = NULL; hipError_t init_result = hipSuccess; if (stream && stream != hipStreamLegacy && stream != hipStreamPerThread) { - init_result = iree_hip_ensure_initialized(); - if (init_result == hipSuccess) { - stream_obj = (iree_hal_streaming_stream_t*)stream; - context = stream_obj->context; - if (!context) { - init_result = hipErrorContextIsDestroyed; - } + stream_obj = (iree_hal_streaming_stream_t*)stream; + context = stream_obj->context; + if (!context) { + init_result = hipErrorContextIsDestroyed; } } else { init_result = iree_hip_ensure_context(&context); @@ -5817,8 +5854,8 @@ HIPAPI hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, IREE_TRACE_ZONE_END(z0); HIP_RETURN_ERROR(kind_result); } - hipError_t range_result = iree_hip_validate_memcpy_ranges( - context, dst, src, sizeBytes, kind, false); + hipError_t range_result = + iree_hip_validate_memcpy_ranges(context, dst, src, sizeBytes, kind, true); if (range_result != hipSuccess) { IREE_TRACE_ZONE_END(z0); HIP_RETURN_ERROR(range_result); @@ -12426,7 +12463,6 @@ HIPAPI hipError_t hipPointerGetAttribute(void* data, break; } case HIP_POINTER_ATTRIBUTE_HOST_POINTER: { - // Return the host pointer if available. if (!buffer_ref.buffer->host_ptr) { result = hipErrorInvalidValue; break; @@ -12461,7 +12497,9 @@ HIPAPI hipError_t hipPointerGetAttribute(void* data, break; } case HIP_POINTER_ATTRIBUTE_MAPPED: { - *(unsigned int*)data = buffer_ref.buffer->host_ptr ? 1 : 0; + // This attribute describes whether the allocation has a runtime-visible + // device mapping, not whether it also has a host alias. + *(unsigned int*)data = 1; break; } case HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS: { @@ -12711,13 +12749,6 @@ HIPAPI hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, IREE_TRACE_ZONE_END(z0); HIP_RETURN_ERROR(hipErrorInvalidValue); } - if (!ptr) { - IREE_TRACE_ZONE_END(z0); - HIP_RETURN_ERROR(hipErrorInvalidValue); - } - - // Initialize with defaults. - memset(attributes, 0, sizeof(*attributes)); // Get the context. iree_hal_streaming_context_t* context = NULL; @@ -12726,6 +12757,13 @@ HIPAPI hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, IREE_TRACE_ZONE_END(z0); HIP_RETURN_ERROR(init_result); } + if (!ptr) { + IREE_TRACE_ZONE_END(z0); + HIP_RETURN_ERROR(hipErrorInvalidValue); + } + + // Initialize with defaults. + memset(attributes, 0, sizeof(*attributes)); iree_hip_pointer_metadata_t metadata; hipError_t metadata_result =