Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions libhrx/src/binding/common/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,11 @@ iree_status_t iree_hal_streaming_event_record(
IREE_RETURN_AND_END_ZONE_IF_ERROR(z0,
iree_hal_streaming_stream_flush(stream));

iree_slim_mutex_lock(&stream->mutex);

// Use stream's current pending value as wait value and increment for signal.
uint64_t wait_value = stream->pending_value;
event->signal_value = wait_value + 1;
stream->pending_value = event->signal_value;

// Create a queue barrier to signal the event semaphore.
// This waits for the stream's last submission to complete before signaling.
Expand All @@ -181,10 +182,19 @@ iree_status_t iree_hal_streaming_event_record(
.payload_values = signal_values,
};

IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_device_queue_barrier(
stream->context->device, stream->queue_affinity, wait_semaphores,
signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE));
iree_status_t status = iree_hal_device_queue_barrier(
stream->context->device, stream->queue_affinity, wait_semaphores,
signal_semaphores, IREE_HAL_EXECUTE_FLAG_NONE);
if (iree_status_is_ok(status)) {
status = iree_hal_device_queue_flush(stream->context->device,
stream->queue_affinity);
}
if (iree_status_is_ok(status)) {
stream->pending_value = event->signal_value;
stream->submitted_value = event->signal_value;
}
iree_slim_mutex_unlock(&stream->mutex);
IREE_RETURN_AND_END_ZONE_IF_ERROR(z0, status);

IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
Expand Down
34 changes: 28 additions & 6 deletions libhrx/src/binding/common/graph.c
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,10 @@ static iree_status_t iree_hal_streaming_pack_raw_argument_list(
iree_host_size_t* out_constants_size) {
IREE_ASSERT_ARGUMENT(parameters);
IREE_ASSERT_ARGUMENT(out_constants_size);
if (iree_hal_streaming_parameter_info_is_empty(parameters)) {
*out_constants_size = 0;
return iree_ok_status();
}
*out_constants_size = parameters->direct_arg_bytes
? parameters->direct_arg_bytes
: parameters->constant_bytes;
Expand All @@ -1038,7 +1042,9 @@ static iree_status_t iree_hal_streaming_pack_raw_argument_list(
if (*out_constants_size == 0) {
return iree_ok_status();
}
if (!parameter_list || !out_constants) {
if (!out_constants || (!parameter_list && (parameters->buffer_size > 0 ||
parameters->binding_count > 0 ||
parameters->copy_count > 0))) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"raw kernel arguments require parameter storage");
}
Expand Down Expand Up @@ -1109,11 +1115,14 @@ iree_status_t iree_hal_streaming_graph_add_kernel_node(
(params->flags & IREE_HAL_STREAMING_DISPATCH_FLAG_ARGS_ARRAY) != 0;
const bool is_native_kernel = symbol->parameters.binding_count == 0 &&
symbol->parameters.copy_count == 0;
if (is_args_array && is_native_kernel && params->buffer) {
const bool is_empty_native_kernel =
is_native_kernel &&
iree_hal_streaming_parameter_info_is_empty(&symbol->parameters);
if (is_args_array && is_native_kernel && !is_empty_native_kernel) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_UNIMPLEMENTED,
"args-array graph kernel launch requires parameter metadata");
"non-empty args-array graph kernel launch requires parameter metadata");
}

iree_host_size_t constants_capacity = symbol->parameters.constant_bytes;
Expand Down Expand Up @@ -1180,7 +1189,9 @@ iree_status_t iree_hal_streaming_graph_add_kernel_node(
attrs->constants_capacity = constants_capacity;
attrs->bindings.count = symbol->parameters.binding_count;
attrs->bindings.values =
(iree_hal_buffer_ref_t*)(extra_data + constants_size);
symbol->parameters.binding_count
? (iree_hal_buffer_ref_t*)(extra_data + constants_size)
: NULL;
attrs->binding_capacity = symbol->parameters.binding_count;
iree_status_t unpack_status = iree_ok_status();
if (is_pre_packed && params->buffer) {
Expand All @@ -1195,6 +1206,10 @@ iree_status_t iree_hal_streaming_graph_add_kernel_node(
}
attrs->constants = iree_make_const_byte_span(constants, captured_size);
attrs->bindings = iree_hal_buffer_ref_list_empty();
} else if (is_args_array && is_empty_native_kernel) {
// HIP host stubs may pass a {NULL} args array for no-argument kernels.
attrs->constants = iree_make_const_byte_span(constants, 0);
attrs->bindings = iree_hal_buffer_ref_list_empty();
} else if (is_args_array) {
unpack_status = iree_hal_streaming_unpack_parameter_list(
graph->context, &symbol->parameters, (void**)params->buffer, constants,
Expand Down Expand Up @@ -1273,10 +1288,13 @@ iree_status_t iree_hal_streaming_graph_set_kernel_node_params(
(params->flags & IREE_HAL_STREAMING_DISPATCH_FLAG_ARGS_ARRAY) != 0;
const bool is_native_kernel = symbol->parameters.binding_count == 0 &&
symbol->parameters.copy_count == 0;
if (is_args_array && is_native_kernel && params->buffer) {
const bool is_empty_native_kernel =
is_native_kernel &&
iree_hal_streaming_parameter_info_is_empty(&symbol->parameters);
if (is_args_array && is_native_kernel && !is_empty_native_kernel) {
return iree_make_status(
IREE_STATUS_UNIMPLEMENTED,
"args-array graph kernel launch requires parameter metadata");
"non-empty args-array graph kernel launch requires parameter metadata");
}

iree_host_size_t constants_capacity = symbol->parameters.constant_bytes;
Expand Down Expand Up @@ -1316,6 +1334,10 @@ iree_status_t iree_hal_streaming_graph_set_kernel_node_params(
}
constants_span = iree_make_const_byte_span(constants, captured_size);
bindings = iree_hal_buffer_ref_list_empty();
} else if (is_args_array && is_empty_native_kernel) {
// HIP host stubs may pass a {NULL} args array for no-argument kernels.
constants_span = iree_make_const_byte_span(constants, 0);
bindings = iree_hal_buffer_ref_list_empty();
} else if (is_args_array) {
unpack_status = iree_hal_streaming_unpack_parameter_list(
node->graph->context, &symbol->parameters, (void**)params->buffer,
Expand Down
9 changes: 9 additions & 0 deletions libhrx/src/binding/common/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,15 @@ typedef struct iree_hal_streaming_parameter_info_t {
iree_hal_streaming_parameter_op_t* ops;
} iree_hal_streaming_parameter_info_t;

// Returns true when |parameters| describes no launch parameters at all, in
// either HAL binding form or native direct-argument form: every byte size
// (buffer_size, constant_bytes, direct_arg_bytes) and every count
// (binding_count, copy_count) must be zero.
static inline bool iree_hal_streaming_parameter_info_is_empty(
    const iree_hal_streaming_parameter_info_t* parameters) {
  // Any non-zero byte size means there is argument data to carry.
  if (parameters->buffer_size != 0) return false;
  if (parameters->constant_bytes != 0) return false;
  if (parameters->direct_arg_bytes != 0) return false;
  // Any non-zero count means there are bindings or copies to process.
  if (parameters->binding_count != 0) return false;
  if (parameters->copy_count != 0) return false;
  return true;
}

// Symbol metadata structure.
typedef struct iree_hal_streaming_symbol_t {
// Parent module. Unowned.
Expand Down
22 changes: 3 additions & 19 deletions libhrx/src/binding/common/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -907,27 +907,14 @@ static iree_status_t iree_hal_streaming_memory_allocate_host_with_context_mode(
.min_alignment = host_alignment,
};

void* host_ptr = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc_aligned(context->host_allocator,
allocation_size, host_alignment,
/*offset=*/0, &host_ptr));

iree_hal_buffer_t* buffer = NULL;
iree_hal_external_buffer_t external_buffer = {
.type = IREE_HAL_EXTERNAL_BUFFER_TYPE_HOST_ALLOCATION,
.flags = IREE_HAL_EXTERNAL_BUFFER_FLAG_NONE,
.size = (iree_device_size_t)allocation_size,
.handle.host_allocation.ptr = host_ptr,
};
iree_status_t status = iree_hal_allocator_import_buffer(
context->device_allocator, params, &external_buffer,
iree_hal_buffer_release_callback_null(), &buffer);
iree_status_t status = iree_hal_allocator_allocate_buffer(
context->device_allocator, params, allocation_size, &buffer);

iree_hal_streaming_buffer_t* wrapper = NULL;
if (iree_status_is_ok(status)) {
status = iree_hal_streaming_buffer_wrap(
context, buffer, (int)memory_type, host_ptr,
context, buffer, (int)memory_type, /*imported_host_ptr=*/NULL,
/*allocation_pool=*/NULL, context_ownership, &wrapper);
}
iree_hal_buffer_release(buffer);
Expand All @@ -939,17 +926,14 @@ static iree_status_t iree_hal_streaming_memory_allocate_host_with_context_mode(
}

if (iree_status_is_ok(status)) {
wrapper->owns_host_ptr = true;
wrapper->imported_host_allocation = false;
wrapper->host_register_flags = flags;
*out_buffer = wrapper;
host_ptr = NULL;
} else {
if (wrapper) {
hrx_buffer_table_remove(&context->buffer_table, wrapper->device_ptr);
iree_hal_streaming_buffer_free(wrapper);
}
iree_allocator_free_aligned(context->host_allocator, host_ptr);
}
IREE_TRACE_ZONE_END(z0);
return status;
Expand Down
66 changes: 47 additions & 19 deletions libhrx/src/binding/common/module.c
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ static iree_status_t iree_hal_streaming_module_extract_metadata(
for (iree_host_size_t i = 0, parameter_base = 0;
iree_status_is_ok(status) && i < module->symbol_count; ++i) {
iree_hal_streaming_symbol_t* symbol = &module->symbols[i];
memset(symbol, 0, sizeof(*symbol));
symbol->module = module;
symbol->name = export_infos[i].name;
symbol->type = IREE_HAL_STREAMING_SYMBOL_TYPE_FUNCTION;
Expand All @@ -272,6 +273,12 @@ static iree_status_t iree_hal_streaming_module_extract_metadata(

// Initialize parameter info.
iree_hal_streaming_parameter_info_t* parameter_info = &symbol->parameters;
if (export_infos[i].constant_byte_length > UINT16_MAX) {
status = iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"function constant metadata exceeds supported parameter size");
continue;
}
// Executable binding_count describes normal HAL dispatch bindings. HRX's
// unpacker needs the number of reflected BINDING parameters it will
// resolve from the HIP launch ABI.
Expand All @@ -289,14 +296,16 @@ static iree_status_t iree_hal_streaming_module_extract_metadata(
// Build operations with coalescing.
// Copy ops go first, then resolve ops.
uint16_t src_offset = 0;
size_t direct_arg_offset = 0;
uint16_t buffer_size = 0;
size_t this_kernel_direct_arg_size = 0; // Native direct-arg prefix size.
iree_hal_streaming_parameter_op_t* copy_ops_start = current_ops;
iree_hal_streaming_parameter_op_t* resolve_ops_start =
current_ops + symbol_op_counts[i].copy_count;
uint16_t copy_count = 0;
uint16_t resolve_count = 0;
for (uint16_t j = 0; j < parameter_count; ++j) {
for (uint16_t j = 0; iree_status_is_ok(status) && j < parameter_count;
++j) {
const iree_hal_executable_export_parameter_t* parameter =
&parameters[parameter_base + j];
const bool is_binding_parameter =
Expand All @@ -305,6 +314,19 @@ static iree_status_t iree_hal_streaming_module_extract_metadata(
parameter->type ==
IREE_HAL_EXECUTABLE_EXPORT_PARAMETER_TYPE_BUFFER_PTR &&
resolve_count < export_infos[i].binding_count;
size_t native_dst_offset = direct_arg_offset;
if (is_buffer_binding_parameter) {
native_dst_offset = parameter->offset;
}
const size_t source_extent = (size_t)src_offset + parameter->size;
const size_t native_extent = native_dst_offset + parameter->size;
if (source_extent > UINT16_MAX || native_dst_offset > UINT16_MAX ||
native_extent > UINT16_MAX) {
status = iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"function parameter metadata exceeds supported argument size");
break;
}
if (is_binding_parameter || is_buffer_binding_parameter) {
// Update offsets. Bindings are passed as pointers.
// |parameter->offset| is the kernarg byte offset for all parameter
Expand All @@ -314,25 +336,27 @@ static iree_status_t iree_hal_streaming_module_extract_metadata(
// exactly the index of this parameter in the bindings list.
iree_hal_streaming_parameter_resolve_op_t* op =
&resolve_ops_start[resolve_count].resolve;
op->reserved = 0;
op->src_offset = src_offset;
op->dst_ordinal = resolve_count;
op->src_ordinal = j;
// For HIP/CUDA native launches using CUSTOM_DIRECT_ARGUMENTS we need
// For HIP native launches using CUSTOM_DIRECT_ARGUMENTS we need
// to place raw device pointers at their kernarg ABI offset. Binding
// export parameter offsets are binding-list ordinals in some IREE HAL
// backends, not byte offsets, so use the packed source offset we
// calculate from the full parameter sequence. AMDGPU BUFFER_PTR
// parameters already carry native kernarg byte offsets.
op->dst_offset =
is_buffer_binding_parameter ? parameter->offset : src_offset;
src_offset += parameter->size;
op->dst_offset = (uint16_t)native_dst_offset;
src_offset = (uint16_t)source_extent;
buffer_size = src_offset;
++resolve_count;

size_t param_extent = (size_t)op->dst_offset + parameter->size;
size_t param_extent = native_extent;
if (param_extent > this_kernel_direct_arg_size) {
this_kernel_direct_arg_size = param_extent;
}
direct_arg_offset =
iree_max(param_extent, direct_arg_offset + parameter->size);
} else {
// TODO: fix coalescing. It does not work when we have
// parameter arrays because each constant comes in as a
Expand All @@ -349,27 +373,31 @@ static iree_status_t iree_hal_streaming_module_extract_metadata(
op->size = parameter->size;
op->src_offset = src_offset;
op->src_ordinal = j;
op->direct_dst_offset = src_offset;
op->direct_dst_offset = (uint16_t)native_dst_offset;
op->dst_offset = parameter->offset; // offset in constants
++copy_count;
// active_copy = op;
// }
src_offset += parameter->size;
src_offset = (uint16_t)source_extent;
buffer_size = src_offset;

size_t direct_arg_extent =
(size_t)op->direct_dst_offset + parameter->size;
size_t direct_arg_extent = native_extent;
if (direct_arg_extent > this_kernel_direct_arg_size) {
this_kernel_direct_arg_size = direct_arg_extent;
}
direct_arg_offset =
iree_max(direct_arg_extent, direct_arg_offset + parameter->size);
}
}
parameter_info->buffer_size = buffer_size;
parameter_info->constant_bytes = export_infos[i].constant_byte_length;
if (buffer_size > this_kernel_direct_arg_size) {
this_kernel_direct_arg_size = buffer_size;
if (iree_status_is_ok(status)) {
parameter_info->buffer_size = buffer_size;
parameter_info->constant_bytes =
(uint16_t)export_infos[i].constant_byte_length;
if (buffer_size > this_kernel_direct_arg_size) {
this_kernel_direct_arg_size = buffer_size;
}
parameter_info->direct_arg_bytes = (uint16_t)this_kernel_direct_arg_size;
}
parameter_info->direct_arg_bytes = this_kernel_direct_arg_size;

// Advance to next symbol's ops.
parameter_base += parameter_count;
Expand Down Expand Up @@ -412,11 +440,11 @@ iree_status_t iree_hal_streaming_module_create_from_memory(
module->cache = context->executable_cache;
iree_hal_executable_cache_retain(module->cache);

// HIP / CUDA hand us anything the toolchain emits — raw AMDGPU ELFs,
// HIP toolchains hand us several container formats: raw AMDGPU ELFs,
// __CLANG_OFFLOAD_BUNDLE__ archives, CCOB (zstd-compressed bundles), and
// __hipFatBinaryWrapper-wrapped combinations of all of the above. Unwrap
// everything here and only forward raw ELF plus an explicit executable
// format to the HAL executable cache.
// __hipFatBinaryWrapper-wrapped combinations of those. Unwrap everything here
// and only forward raw ELF plus an explicit executable format to the HAL
// executable cache.
iree_const_byte_span_t executable_data = image;
const char* executable_format = NULL;
const bool try_fat_unwrap = context->device_entry != NULL &&
Expand Down
Loading
Loading