| // Copyright 2026 The IREE Authors |
| // |
| // Licensed under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| #include "iree/hal/drivers/amdgpu/host_queue.h" |
| |
| #include <stdio.h> |
| #include <string.h> |
| |
| #include "iree/async/frontier_tracker.h" |
| #include "iree/async/notification.h" |
| #include "iree/base/threading/thread.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_blit.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_command_buffer.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_dispatch.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_file.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_host_call.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_memory.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_pending.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_policy.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_profile.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_profile_events.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_submission.h" |
| #include "iree/hal/drivers/amdgpu/host_queue_waits.h" |
| #include "iree/hal/drivers/amdgpu/semaphore.h" |
| #include "iree/hal/drivers/amdgpu/transient_buffer.h" |
| #include "iree/hal/drivers/amdgpu/util/pm4_emitter.h" |
| #include "iree/hal/utils/resource_set.h" |
| |
| static const iree_hal_amdgpu_virtual_queue_vtable_t |
| iree_hal_amdgpu_host_queue_vtable; |
| |
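// Allocates the PM4 indirect-buffer slot array used by AQL PM4-IB vendor
// packets: one slot per AQL packet slot so slots can be indexed by packet id.
// The GPU agent is granted access and the memory is zeroed before first use.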
| static iree_status_t iree_hal_amdgpu_host_queue_allocate_pm4_ib_slots( |
| const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t gpu_agent, |
| hsa_amd_memory_pool_t pm4_ib_pool, uint32_t aql_queue_capacity, |
| iree_hal_amdgpu_host_queue_t* out_queue) { |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, aql_queue_capacity); |
| iree_host_size_t pm4_ib_size = 0; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, IREE_STRUCT_LAYOUT( |
| 0, &pm4_ib_size, |
| IREE_STRUCT_FIELD(aql_queue_capacity, |
| iree_hal_amdgpu_pm4_ib_slot_t, NULL))); |
| if (IREE_UNLIKELY(!pm4_ib_pool.handle)) { |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "PM4 IB memory pool is required")); |
| } |
| IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, pm4_ib_size); |
| iree_hal_amdgpu_pm4_ib_slot_t* pm4_ib_slots = NULL; |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_hsa_amd_memory_pool_allocate( |
| IREE_LIBHSA(libhsa), pm4_ib_pool, pm4_ib_size, |
| HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, (void**)&pm4_ib_slots)); |
| iree_status_t status = iree_hsa_amd_agents_allow_access( |
| IREE_LIBHSA(libhsa), /*num_agents=*/1, &gpu_agent, /*flags=*/NULL, |
| pm4_ib_slots); |
| if (iree_status_is_ok(status)) { |
| memset(pm4_ib_slots, 0, pm4_ib_size); |
| out_queue->pm4_ib_slots = pm4_ib_slots; |
| } else { |
| status = iree_status_join(status, iree_hsa_amd_memory_pool_free( |
| IREE_LIBHSA(libhsa), pm4_ib_slots)); |
| } |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
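// Notification ring drain callback invoked once per retired entry. Releases
// the profile dispatch and queue-device event reservations captured when the
// entry was originally submitted.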
| static void iree_hal_amdgpu_host_queue_reclaim_retired( |
| iree_hal_amdgpu_reclaim_entry_t* entry, uint64_t epoch, void* user_data) { |
| (void)epoch; |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)user_data; |
| iree_hal_amdgpu_profile_dispatch_event_reservation_t reservation = { |
| .first_event_position = entry->profile_event_first_position, |
| .event_count = entry->profile_event_count, |
| }; |
| iree_hal_amdgpu_host_queue_retire_profile_dispatch_events(queue, reservation); |
| iree_hal_amdgpu_profile_queue_device_event_reservation_t |
| queue_device_reservation = { |
| .first_event_position = entry->queue_device_event_first_position, |
| .event_count = entry->queue_device_event_count, |
| }; |
| iree_hal_amdgpu_host_queue_retire_profile_queue_device_events( |
| queue, queue_device_reservation); |
| } |
| |
| //===----------------------------------------------------------------------===// |
// Completion and error handling
| //===----------------------------------------------------------------------===// |
| |
| void iree_hal_amdgpu_host_queue_enqueue_post_drain_action( |
| iree_hal_amdgpu_host_queue_t* queue, |
| iree_hal_amdgpu_host_queue_post_drain_action_t* action, |
| iree_hal_amdgpu_host_queue_post_drain_fn_t fn, void* user_data) { |
| action->next = NULL; |
| action->fn = fn; |
| action->user_data = user_data; |
| |
| iree_slim_mutex_lock(&queue->locks.post_drain_mutex); |
| if (queue->post_drain.tail) { |
| queue->post_drain.tail->next = action; |
| } else { |
| queue->post_drain.head = action; |
| } |
| queue->post_drain.tail = action; |
| iree_slim_mutex_unlock(&queue->locks.post_drain_mutex); |
| } |
| |
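// Detaches the post-drain action list under post_drain_mutex and runs the
// actions in FIFO order outside the lock so that actions may safely enqueue
// follow-up work.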
| static void iree_hal_amdgpu_host_queue_run_post_drain_actions( |
| iree_hal_amdgpu_host_queue_t* queue) { |
| iree_slim_mutex_lock(&queue->locks.post_drain_mutex); |
| iree_hal_amdgpu_host_queue_post_drain_action_t* action = |
| queue->post_drain.head; |
| queue->post_drain.head = NULL; |
| queue->post_drain.tail = NULL; |
| iree_slim_mutex_unlock(&queue->locks.post_drain_mutex); |
| |
| while (action) { |
| iree_hal_amdgpu_host_queue_post_drain_action_t* next_action = action->next; |
| action->next = NULL; |
| action->fn(action->user_data); |
| action = next_action; |
| } |
| } |
| |
| // Drains completed notification entries and reclaims kernarg space. If the GPU |
| // queue has faulted (error_status is set), fails all pending entries instead of |
| // draining normally. |
| static iree_host_size_t iree_hal_amdgpu_host_queue_drain_completions( |
| iree_hal_amdgpu_host_queue_t* queue) { |
| // Check for GPU queue error (set by the HSA error callback on another |
| // thread). If the queue has faulted, no further epochs will advance; |
| // fail all pending entries so waiters get the actual GPU error instead |
| // of hanging or timing out. |
| iree_status_t error = (iree_status_t)iree_atomic_load( |
| &queue->error_status, iree_memory_order_acquire); |
| const uint64_t previous_epoch = (uint64_t)iree_atomic_load( |
| &queue->notification_ring.epoch.last_drained, iree_memory_order_relaxed); |
| uint64_t kernarg_reclaim_position = 0; |
| iree_host_size_t count = 0; |
  if (IREE_UNLIKELY(!iree_status_is_ok(error))) {
| count = iree_hal_amdgpu_notification_ring_fail_all( |
| &queue->notification_ring, error, &kernarg_reclaim_position); |
| iree_hal_amdgpu_host_queue_clear_profile_events(queue); |
| iree_async_frontier_tracker_fail_axis( |
| queue->frontier_tracker, queue->axis, |
| iree_status_from_code(iree_status_code(error))); |
| } else { |
| count = iree_hal_amdgpu_notification_ring_drain( |
| &queue->notification_ring, |
| /*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired, |
| queue, &kernarg_reclaim_position); |
| const uint64_t current_epoch = |
| (uint64_t)iree_atomic_load(&queue->notification_ring.epoch.last_drained, |
| iree_memory_order_acquire); |
| if (current_epoch > previous_epoch) { |
| iree_async_frontier_tracker_advance(queue->frontier_tracker, queue->axis, |
| current_epoch); |
| } |
| } |
| if (kernarg_reclaim_position > 0) { |
| iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring, |
| kernarg_reclaim_position); |
| } |
| iree_hal_amdgpu_host_queue_run_post_drain_actions(queue); |
| return count; |
| } |
| |
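// Returns true if a sticky GPU/queue error has been recorded on the queue.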
| static bool iree_hal_amdgpu_host_queue_has_error( |
| iree_hal_amdgpu_host_queue_t* queue) { |
| return iree_atomic_load(&queue->error_status, iree_memory_order_acquire) != 0; |
| } |
| |
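// Stores |error| as the sticky queue error if none is recorded yet (first
// error wins) and returns true; otherwise frees |error| and returns false.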
| static bool iree_hal_amdgpu_host_queue_store_error( |
| iree_hal_amdgpu_host_queue_t* queue, iree_status_t error) { |
| intptr_t expected = 0; |
| if (iree_atomic_compare_exchange_strong( |
| &queue->error_status, &expected, (intptr_t)error, |
| iree_memory_order_release, iree_memory_order_acquire)) { |
| return true; |
| } |
| iree_status_free(error); |
| return false; |
| } |
| |
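// Wakes the completion thread (if the stop signal exists) and requests that
// it exit after performing one final drain.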
| static void iree_hal_amdgpu_host_queue_request_completion_thread_stop( |
| iree_hal_amdgpu_host_queue_t* queue) { |
| if (queue->completion.stop_signal.handle) { |
| iree_hsa_signal_store_screlease(IREE_LIBHSA(queue->libhsa), |
| queue->completion.stop_signal, 1); |
| } |
| } |
| |
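// Translates the last drained notification epoch into its expected epoch
// signal value so the completion thread can arm an HSA_SIGNAL_CONDITION_NE
// wait against it.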
| static hsa_signal_value_t iree_hal_amdgpu_host_queue_last_drained_signal_value( |
| iree_hal_amdgpu_host_queue_t* queue) { |
| const uint64_t last_drained_epoch = (uint64_t)iree_atomic_load( |
| &queue->notification_ring.epoch.last_drained, iree_memory_order_acquire); |
| return (hsa_signal_value_t)(IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE - |
| last_drained_epoch); |
| } |
| |
// Completion thread entry point. Blocks in HSA until either the queue epoch
// signal changes or teardown/error raises the stop signal. Completion wakeups
// drain normally; stop/error wakeups perform one final drain/fail before exit.
| static int iree_hal_amdgpu_host_queue_completion_thread_main(void* entry_arg) { |
| { |
| IREE_TRACE_ZONE_BEGIN_NAMED( |
| z0, "iree_hal_amdgpu_host_queue_completion_thread_start"); |
| IREE_TRACE_ZONE_END(z0); |
| } |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)entry_arg; |
| |
| enum { |
| IREE_HAL_AMDGPU_COMPLETION_WAIT_EPOCH_SIGNAL = 0, |
| IREE_HAL_AMDGPU_COMPLETION_WAIT_STOP_SIGNAL = 1, |
| IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT = 2, |
| }; |
| |
| hsa_signal_t epoch_signal = |
| iree_hal_amdgpu_notification_ring_epoch_signal(&queue->notification_ring); |
| hsa_signal_t stop_signal = queue->completion.stop_signal; |
| hsa_signal_value_t last_epoch_value = |
| iree_hal_amdgpu_host_queue_last_drained_signal_value(queue); |
| |
| bool keep_running = true; |
| while (keep_running) { |
| hsa_signal_t signals[IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT] = { |
| epoch_signal, |
| stop_signal, |
| }; |
| hsa_signal_condition_t |
| conditions[IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT] = { |
| HSA_SIGNAL_CONDITION_NE, |
| HSA_SIGNAL_CONDITION_NE, |
| }; |
| hsa_signal_value_t values[IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT] = { |
| last_epoch_value, |
| 0, |
| }; |
| const uint32_t signal_index = iree_hsa_amd_signal_wait_any( |
| IREE_LIBHSA(queue->libhsa), |
| IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT, signals, conditions, |
| values, UINT64_MAX, HSA_WAIT_STATE_BLOCKED, |
| /*satisfying_value=*/NULL); |
| |
| { |
| IREE_TRACE_ZONE_BEGIN_NAMED( |
| z0, "iree_hal_amdgpu_host_queue_completion_thread_pump"); |
| IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, signal_index); |
| |
| if (signal_index == IREE_HAL_AMDGPU_COMPLETION_WAIT_EPOCH_SIGNAL) { |
| iree_hal_amdgpu_host_queue_drain_completions(queue); |
| // Arm the next wait from the epoch we actually drained, not from a raw |
| // HSA signal load. A GPU completion can race with the drain and update |
| // the signal after drain() sampled it; observing that newer value here |
| // would mark an undrained epoch as already seen and could sleep forever |
| // with a user semaphore still pending. |
| last_epoch_value = |
| iree_hal_amdgpu_host_queue_last_drained_signal_value(queue); |
| } |
| |
| if (signal_index == IREE_HAL_AMDGPU_COMPLETION_WAIT_STOP_SIGNAL || |
| iree_hal_amdgpu_host_queue_has_error(queue)) { |
| iree_hal_amdgpu_host_queue_drain_completions(queue); |
| keep_running = false; |
| } else if (IREE_UNLIKELY(signal_index >= |
| IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT)) { |
| iree_status_t error = iree_make_status( |
| IREE_STATUS_INTERNAL, |
| "hsa_amd_signal_wait_any returned invalid signal index %u", |
| signal_index); |
| iree_hal_amdgpu_host_queue_store_error(queue, error); |
| iree_hal_amdgpu_host_queue_drain_completions(queue); |
| keep_running = false; |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| } |
| |
| { |
| IREE_TRACE_ZONE_BEGIN_NAMED( |
| z0, "iree_hal_amdgpu_host_queue_completion_thread_exit"); |
| IREE_TRACE_ZONE_END(z0); |
| } |
| return 0; |
| } |
| |
| // HSA queue error callback. Called by the HSA runtime (on an internal thread) |
| // when the queue encounters an unrecoverable error (page fault, invalid AQL |
| // packet, ECC error). Stores the error atomically on the queue so the |
| // completion thread can fail pending semaphores with the actual GPU error. |
| static void iree_hal_amdgpu_host_queue_error_callback(hsa_status_t status, |
| hsa_queue_t* source, |
| void* data) { |
  (void)source;
  iree_hal_amdgpu_host_queue_t* queue = (iree_hal_amdgpu_host_queue_t*)data;
| |
| // Convert the HSA error to an IREE status with diagnostic information. |
| iree_status_t error = iree_status_from_hsa_status( |
| __FILE__, __LINE__, status, "hsa_queue_error_callback", |
| "GPU queue encountered an unrecoverable error"); |
| |
| // First-error-wins: store the error with release semantics so the status |
| // payload (heap-allocated string, backtrace) is visible to any thread that |
| // loads with acquire. If another error already won the race, free ours. |
| if (iree_hal_amdgpu_host_queue_store_error(queue, error)) { |
| iree_hal_amdgpu_host_queue_request_completion_thread_stop(queue); |
| } |
| } |
| |
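//===----------------------------------------------------------------------===//
// Initialization / deinitialization
//===----------------------------------------------------------------------===//
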
| iree_status_t iree_hal_amdgpu_host_queue_initialize( |
| const iree_hal_amdgpu_libhsa_t* libhsa, iree_hal_device_t* logical_device, |
| iree_async_proactor_t* proactor, hsa_agent_t gpu_agent, |
| const iree_hal_amdgpu_kernarg_ring_memory_t* kernarg_memory, |
| hsa_amd_memory_pool_t pm4_ib_pool, |
| iree_async_frontier_tracker_t* frontier_tracker, iree_async_axis_t axis, |
| iree_hal_queue_affinity_t queue_affinity, |
| iree_thread_affinity_t completion_thread_affinity, |
| iree_hal_amdgpu_wait_barrier_strategy_t wait_barrier_strategy, |
| iree_hal_amdgpu_vendor_packet_capability_flags_t vendor_packet_capabilities, |
| iree_hal_amdgpu_epoch_signal_table_t* epoch_table, |
| iree_arena_block_pool_t* block_pool, |
| iree_hal_amdgpu_block_pool_t* profiling_signal_block_pool, |
| const iree_hal_amdgpu_device_buffer_transfer_context_t* transfer_context, |
| const iree_hal_pool_set_t* default_pool_set, iree_hal_pool_t* default_pool, |
| iree_hal_amdgpu_transient_buffer_pool_t* transient_buffer_pool, |
| iree_hal_amdgpu_staging_pool_t* staging_pool, |
| iree_host_size_t device_ordinal, uint32_t aql_queue_capacity, |
| uint32_t notification_capacity, uint32_t kernarg_capacity_in_blocks, |
| iree_allocator_t host_allocator, iree_hal_amdgpu_host_queue_t* out_queue) { |
| IREE_ASSERT_ARGUMENT(libhsa); |
| IREE_ASSERT_ARGUMENT(logical_device); |
| IREE_ASSERT_ARGUMENT(proactor); |
| IREE_ASSERT_ARGUMENT(kernarg_memory); |
| IREE_ASSERT_ARGUMENT(frontier_tracker); |
| IREE_ASSERT_ARGUMENT(epoch_table); |
| IREE_ASSERT_ARGUMENT(block_pool); |
| IREE_ASSERT_ARGUMENT(profiling_signal_block_pool); |
| IREE_ASSERT_ARGUMENT(transfer_context); |
| IREE_ASSERT_ARGUMENT(default_pool_set); |
| IREE_ASSERT_ARGUMENT(default_pool); |
| IREE_ASSERT_ARGUMENT(transient_buffer_pool); |
| IREE_ASSERT_ARGUMENT(out_queue); |
| |
| if (!iree_host_size_is_power_of_two(aql_queue_capacity) || |
| !iree_host_size_is_power_of_two(notification_capacity) || |
| !iree_host_size_is_power_of_two(kernarg_capacity_in_blocks)) { |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "all capacities must be powers of two"); |
| } |
| if (kernarg_capacity_in_blocks / 2u < aql_queue_capacity) { |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "kernarg ring capacity must be at least 2x the AQL ring capacity " |
| "to cover one tail-padding gap at wrap (got kernarg_blocks=%u, " |
| "aql_packets=%u)", |
| kernarg_capacity_in_blocks, aql_queue_capacity); |
| } |
| |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| memset(out_queue, 0, sizeof(*out_queue)); |
| out_queue->base.vtable = &iree_hal_amdgpu_host_queue_vtable; |
| out_queue->libhsa = libhsa; |
| out_queue->logical_device = logical_device; |
| out_queue->proactor = proactor; |
| out_queue->frontier_tracker = frontier_tracker; |
| out_queue->host_allocator = host_allocator; |
| |
| // Submission pipeline state. |
| iree_slim_mutex_initialize(&out_queue->locks.submission_mutex); |
| iree_slim_mutex_initialize(&out_queue->locks.post_drain_mutex); |
| iree_slim_mutex_initialize(&out_queue->profiling.event_mutex); |
| out_queue->profiling.signals.block_pool = profiling_signal_block_pool; |
| out_queue->axis = axis; |
| out_queue->wait_barrier_strategy = wait_barrier_strategy; |
| out_queue->vendor_packet_capabilities = vendor_packet_capabilities; |
| out_queue->queue_affinity = queue_affinity; |
| out_queue->last_signal.semaphore = NULL; |
| out_queue->last_signal.epoch = 0; |
| out_queue->block_pool = block_pool; |
| out_queue->can_publish_frontier = true; |
| out_queue->transfer_context = transfer_context; |
| out_queue->default_pool_set = default_pool_set; |
| out_queue->default_pool = default_pool; |
| out_queue->transient_buffer_pool = transient_buffer_pool; |
| out_queue->staging_pool = staging_pool; |
| out_queue->device_ordinal = device_ordinal; |
| out_queue->pending_head = NULL; |
| iree_async_frontier_initialize(iree_hal_amdgpu_host_queue_frontier(out_queue), |
| /*entry_count=*/0); |
| |
| // The optional tracker semaphore is an iree_async_semaphore_t bridge for |
| // CPU-side wait integration. The queue's GPU-visible HSA epoch signal is |
| // created by the notification ring below and registered in the epoch table. |
| iree_status_t status = iree_async_frontier_tracker_register_axis( |
| frontier_tracker, axis, /*semaphore=*/NULL); |
| |
| // Create the host-only stop signal before the hardware queue so the HSA error |
| // callback always has a valid signal to wake if queue creation races with an |
| // asynchronous fault. |
| if (iree_status_is_ok(status)) { |
| status = iree_hsa_amd_signal_create( |
| IREE_LIBHSA(libhsa), /*initial_value=*/0, |
| /*num_consumers=*/0, /*consumers=*/NULL, /*attributes=*/0, |
| &out_queue->completion.stop_signal); |
| } |
| |
| // Create the HSA hardware AQL queue. |
| // |
| // HSA_QUEUE_TYPE_MULTI is required (not just an optimization). Once command |
| // buffers start performing device-side enqueue, the CP itself becomes a |
| // concurrent producer alongside the host submission path, so the queue must |
| // permit multiple concurrent producers. The host-side reserve already uses |
| // an atomic fetch_add on the write index, which is well-defined only on |
| // MULTI queues. |
| hsa_queue_t* hardware_queue = NULL; |
| if (iree_status_is_ok(status)) { |
| status = iree_hsa_queue_create( |
| IREE_LIBHSA(libhsa), gpu_agent, aql_queue_capacity, |
| HSA_QUEUE_TYPE_MULTI, iree_hal_amdgpu_host_queue_error_callback, |
| /*data=*/out_queue, |
| /*private_segment_size=*/UINT32_MAX, |
| /*group_segment_size=*/UINT32_MAX, &hardware_queue); |
| } |
| |
| // Initialize the AQL ring from the hardware queue. |
| if (iree_status_is_ok(status)) { |
| out_queue->hardware_queue = hardware_queue; |
| iree_hal_amdgpu_aql_ring_initialize((iree_amd_queue_t*)hardware_queue, |
| &out_queue->aql_ring); |
| } |
| |
| // Initialize the kernarg ring from the selected HSA memory pool. |
| if (iree_status_is_ok(status)) { |
| status = iree_hal_amdgpu_kernarg_ring_initialize(libhsa, kernarg_memory, |
| kernarg_capacity_in_blocks, |
| &out_queue->kernarg_ring); |
| } |
| |
| // Initialize the optional PM4 IB slot buffer. Capability-driven allocation |
| // keeps dynamic PM4 storage available on CDNA queues that use BARRIER_VALUE |
| // for waits but still support AQL PM4-IB snippets for other features. The |
| // buffer is indexed by AQL packet id and inherits AQL ring |
| // backpressure/reuse; there is no separate PM4 producer or reclaim position. |
| if (iree_status_is_ok(status) && |
| (vendor_packet_capabilities & |
| IREE_HAL_AMDGPU_VENDOR_PACKET_CAPABILITY_AQL_PM4_IB)) { |
| status = iree_hal_amdgpu_host_queue_allocate_pm4_ib_slots( |
| libhsa, gpu_agent, pm4_ib_pool, aql_queue_capacity, out_queue); |
| } |
| |
| // Initialize the notification ring (creates epoch signal + entry buffer). |
| if (iree_status_is_ok(status)) { |
| status = iree_hal_amdgpu_notification_ring_initialize( |
| libhsa, block_pool, notification_capacity, host_allocator, |
| &out_queue->notification_ring); |
| } |
| |
| // Register this queue's epoch signal in the shared table for cross-queue |
| // barrier emission lookups. Must happen after notification ring init (which |
| // creates the epoch signal) and before any submissions. |
| if (iree_status_is_ok(status)) { |
| iree_hal_amdgpu_epoch_signal_table_register( |
| epoch_table, iree_async_axis_device_index(axis), |
| iree_async_axis_queue_index(axis), |
| iree_hal_amdgpu_notification_ring_epoch_signal( |
| &out_queue->notification_ring)); |
| out_queue->epoch_table = epoch_table; |
| } |
| |
| if (iree_status_is_ok(status)) { |
| iree_thread_create_params_t thread_params; |
| memset(&thread_params, 0, sizeof(thread_params)); |
| char thread_name[32] = {0}; |
| snprintf(thread_name, IREE_ARRAYSIZE(thread_name), |
| "iree-hal-amdgpu-l0p%uq%u-complete", |
| (unsigned)iree_async_axis_device_index(axis), |
| (unsigned)iree_async_axis_queue_index(axis)); |
| thread_params.name = iree_make_cstring_view(thread_name); |
| thread_params.initial_affinity = completion_thread_affinity; |
| status = iree_thread_create( |
| iree_hal_amdgpu_host_queue_completion_thread_main, out_queue, |
| thread_params, host_allocator, &out_queue->completion.thread); |
| } |
| if (!iree_status_is_ok(status)) { |
| iree_hal_amdgpu_host_queue_deinitialize(out_queue); |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| void iree_hal_amdgpu_host_queue_deinitialize( |
| iree_hal_amdgpu_host_queue_t* queue) { |
| IREE_ASSERT_ARGUMENT(queue); |
| IREE_TRACE_ZONE_BEGIN(z0); |
| |
| iree_slim_mutex_lock(&queue->locks.submission_mutex); |
| queue->is_shutting_down = true; |
| iree_slim_mutex_unlock(&queue->locks.submission_mutex); |
| |
| if (queue->completion.thread) { |
| iree_hal_amdgpu_host_queue_request_completion_thread_stop(queue); |
| // There is only one owner for the thread, so this also joins the thread. |
| iree_thread_release(queue->completion.thread); |
| queue->completion.thread = NULL; |
| } |
| |
| // Destroy the hardware queue before the remaining host-side resources so the |
| // HSA runtime cannot race a late error callback against signal teardown. |
| if (queue->hardware_queue) { |
| iree_hal_amdgpu_hsa_cleanup_assert_success( |
| iree_hsa_queue_destroy_raw(queue->libhsa, queue->hardware_queue)); |
| queue->hardware_queue = NULL; |
| } |
| |
| // Capacity-parked pending ops are retried by post-drain callbacks. Flush |
| // those callbacks under shutdown first so they observe cancellation and own |
| // their normal failure path instead of being destroyed out from under the |
| // callback storage. |
| iree_hal_amdgpu_host_queue_run_post_drain_actions(queue); |
| |
| // Cancel all pending (deferred) operations. Their signal semaphores are |
| // failed with CANCELLED so downstream waiters don't hang. |
| if (queue->pending_head) { |
| iree_hal_amdgpu_host_queue_cancel_pending(queue, IREE_STATUS_CANCELLED, |
| "queue shutting down"); |
| } |
| |
| // Process any remaining notification entries before destroying resources. |
| // If the GPU faulted, fail all pending entries so waiters get the actual |
| // error. Otherwise drain normally (entries completed but not yet processed). |
| iree_status_t error = (iree_status_t)iree_atomic_load( |
| &queue->error_status, iree_memory_order_acquire); |
| uint64_t kernarg_reclaim_position = 0; |
| if (!iree_status_is_ok(error)) { |
| iree_hal_amdgpu_notification_ring_fail_all(&queue->notification_ring, error, |
| &kernarg_reclaim_position); |
| iree_hal_amdgpu_host_queue_clear_profile_events(queue); |
| iree_status_free(error); |
| } else { |
| iree_hal_amdgpu_notification_ring_drain( |
| &queue->notification_ring, |
| /*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired, |
| queue, &kernarg_reclaim_position); |
| } |
| if (kernarg_reclaim_position > 0) { |
| iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring, |
| kernarg_reclaim_position); |
| } |
| iree_hal_amdgpu_host_queue_run_post_drain_actions(queue); |
| |
| // Deregister from the epoch signal table before destroying the notification |
| // ring (which owns the epoch signal). Guarded by epoch_table != NULL to |
| // handle partial initialization (init failed before registration). |
| if (queue->epoch_table) { |
| iree_hal_amdgpu_epoch_signal_table_deregister( |
| queue->epoch_table, iree_async_axis_device_index(queue->axis), |
| iree_async_axis_queue_index(queue->axis)); |
| queue->epoch_table = NULL; |
| } |
| |
| if (queue->frontier_tracker) { |
| iree_async_frontier_tracker_retire_axis( |
| queue->frontier_tracker, queue->axis, |
| iree_status_from_code(IREE_STATUS_CANCELLED)); |
| queue->frontier_tracker = NULL; |
| queue->axis = 0; |
| } |
| |
| iree_hal_amdgpu_notification_ring_deinitialize(&queue->notification_ring); |
| |
| iree_hal_amdgpu_kernarg_ring_deinitialize(queue->libhsa, |
| &queue->kernarg_ring); |
| |
| if (queue->pm4_ib_slots) { |
| iree_hal_amdgpu_hsa_cleanup_assert_success( |
| iree_hsa_amd_memory_pool_free_raw(queue->libhsa, queue->pm4_ib_slots)); |
| queue->pm4_ib_slots = NULL; |
| } |
| |
| iree_hal_amdgpu_host_queue_deallocate_profiling_completion_signals(queue); |
| iree_hal_amdgpu_host_queue_deallocate_profile_events(queue); |
| |
| if (queue->command_buffer_scratch) { |
| iree_allocator_free(queue->host_allocator, queue->command_buffer_scratch); |
| queue->command_buffer_scratch = NULL; |
| } |
| |
| if (queue->completion.stop_signal.handle) { |
| iree_hal_amdgpu_hsa_cleanup_assert_success(iree_hsa_signal_destroy_raw( |
| queue->libhsa, queue->completion.stop_signal)); |
| queue->completion.stop_signal.handle = 0; |
| } |
| |
| iree_slim_mutex_deinitialize(&queue->locks.post_drain_mutex); |
| iree_slim_mutex_deinitialize(&queue->profiling.event_mutex); |
| iree_slim_mutex_deinitialize(&queue->locks.submission_mutex); |
| |
| IREE_TRACE_ZONE_END(z0); |
| } |
| |
| iree_status_t iree_hal_amdgpu_host_queue_set_hsa_profiling_enabled( |
| iree_hal_amdgpu_host_queue_t* queue, bool enabled) { |
| IREE_ASSERT_ARGUMENT(queue); |
| IREE_TRACE_ZONE_BEGIN(z0); |
| IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0); |
| |
| if (enabled) { |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, iree_hal_amdgpu_host_queue_ensure_profile_event_storage(queue)); |
| IREE_RETURN_AND_END_ZONE_IF_ERROR( |
| z0, |
| iree_hal_amdgpu_host_queue_ensure_profiling_completion_signals(queue)); |
| iree_hal_amdgpu_host_queue_clear_profile_events(queue); |
| } |
| |
| iree_status_t status = iree_hsa_amd_profiling_set_profiler_enabled( |
| IREE_LIBHSA(queue->libhsa), queue->hardware_queue, enabled ? 1 : 0); |
| if (iree_status_is_ok(status)) { |
| queue->profiling.hsa_queue_timestamps_enabled = enabled ? 1 : 0; |
| } |
| |
| IREE_TRACE_ZONE_END(z0); |
| return status; |
| } |
| |
| static void iree_hal_amdgpu_host_queue_trim( |
| iree_hal_amdgpu_virtual_queue_t* base_queue) {} |
| |
| //===----------------------------------------------------------------------===// |
| // Queue operations |
| //===----------------------------------------------------------------------===// |
| |
| typedef struct iree_hal_amdgpu_host_queue_op_submission_t { |
| // Queue whose submission_mutex is held between begin/end. |
| iree_hal_amdgpu_host_queue_t* queue; |
| |
| // Wait resolution computed while holding submission_mutex. |
| iree_hal_amdgpu_wait_resolution_t resolution; |
| |
| // Deferred operation captured while holding submission_mutex, if any. |
| iree_hal_amdgpu_pending_op_t* deferred_op; |
| |
| // Number of input waits. Capacity retries only need post-drain resubmission |
| // when no semantic waits are available to naturally re-enter the queue. |
| iree_host_size_t wait_semaphore_count; |
| |
| // Whether the direct submit helper found enough queue capacity. |
| bool ready; |
| |
| // Whether |deferred_op| should retry on the completion thread after drain. |
| bool wait_for_capacity; |
| } iree_hal_amdgpu_host_queue_op_submission_t; |
| |
| // Begins one direct/deferred queue operation attempt. The caller must pair this |
| // with iree_hal_amdgpu_host_queue_op_submission_end exactly once. |
| static inline void iree_hal_amdgpu_host_queue_op_submission_begin( |
| iree_hal_amdgpu_host_queue_t* queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| iree_hal_amdgpu_host_queue_op_submission_t* out_submission) { |
| out_submission->queue = queue; |
| out_submission->deferred_op = NULL; |
| out_submission->wait_semaphore_count = wait_semaphore_list.count; |
| out_submission->ready = true; |
| out_submission->wait_for_capacity = false; |
| |
| iree_slim_mutex_lock(&queue->locks.submission_mutex); |
| iree_hal_amdgpu_host_queue_resolve_waits(queue, wait_semaphore_list, |
| &out_submission->resolution); |
| } |
| |
// Marks a captured pending op to retry after a completion-thread drain when
// direct submission ran out of queue capacity. Skipped when the op has
// semantic waits, as those will naturally re-enter the queue on their own.
| static inline void iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity( |
| iree_hal_amdgpu_host_queue_op_submission_t* submission) { |
| submission->wait_for_capacity = submission->wait_semaphore_count == 0; |
| } |
| |
| // Ends one direct/deferred queue operation attempt by releasing |
| // submission_mutex and starting any captured pending op outside the lock. |
| static inline iree_status_t iree_hal_amdgpu_host_queue_op_submission_end( |
| iree_hal_amdgpu_host_queue_op_submission_t* submission, |
| iree_status_t status) { |
| iree_slim_mutex_unlock(&submission->queue->locks.submission_mutex); |
| |
| if (iree_status_is_ok(status) && submission->deferred_op) { |
| status = iree_hal_amdgpu_pending_op_start(submission->deferred_op, |
| submission->wait_for_capacity); |
| } |
| return status; |
| } |
| |
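// Signals |signal_semaphore_list| for a barrier-only submission that has no
// waits and no command buffer. Shutdown is checked under submission_mutex but
// the semaphores are signaled outside the lock.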
| static iree_status_t iree_hal_amdgpu_host_queue_signal_empty_barrier( |
| iree_hal_amdgpu_host_queue_t* queue, |
| const iree_hal_semaphore_list_t signal_semaphore_list) { |
| iree_slim_mutex_lock(&queue->locks.submission_mutex); |
| iree_status_t status = iree_ok_status(); |
| if (IREE_UNLIKELY(queue->is_shutting_down)) { |
| status = iree_make_status(IREE_STATUS_CANCELLED, "queue shutting down"); |
| } |
| iree_slim_mutex_unlock(&queue->locks.submission_mutex); |
| |
| if (iree_status_is_ok(status)) { |
| // Signal outside submission_mutex: semaphore signaling dispatches satisfied |
| // timepoints, and those callbacks may submit additional queue work. |
| status = iree_hal_semaphore_list_signal(signal_semaphore_list, |
| /*frontier=*/NULL); |
| } |
| return status; |
| } |
| |
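// Queue execute entry point. Barrier-only submissions with no waits signal
// their semaphores immediately; otherwise the command buffer (or barrier) is
// submitted directly or captured as a pending op when waits or queue capacity
// require deferral.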
| static iree_status_t iree_hal_amdgpu_host_queue_execute( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_command_buffer_t* command_buffer, |
| iree_hal_buffer_binding_table_t binding_table, |
| iree_hal_execute_flags_t flags) { |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| |
| IREE_RETURN_IF_ERROR( |
| iree_hal_amdgpu_host_queue_validate_execute_flags(flags)); |
| |
| if (!command_buffer && wait_semaphore_list.count == 0) { |
| if (IREE_UNLIKELY(binding_table.count != 0)) { |
| return iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "barrier-only queue_execute must not provide a binding table " |
| "(count=%" PRIhsz ")", |
| binding_table.count); |
| } |
| return iree_hal_amdgpu_host_queue_signal_empty_barrier( |
| queue, signal_semaphore_list); |
| } |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_execute( |
| queue, &wait_semaphore_list, &signal_semaphore_list, command_buffer, |
| binding_table, flags, &submission.deferred_op); |
| } else if (!command_buffer) { |
| if (IREE_UNLIKELY(binding_table.count != 0)) { |
| status = iree_make_status( |
| IREE_STATUS_INVALID_ARGUMENT, |
| "barrier-only queue_execute must not provide a binding table " |
| "(count=%" PRIhsz ")", |
| binding_table.count); |
| } else { |
| uint64_t submission_id = 0; |
| iree_hal_amdgpu_host_queue_profile_event_info_t profile_event_info = { |
| .type = IREE_HAL_PROFILE_QUEUE_EVENT_TYPE_BARRIER, |
| .operation_count = 0, |
| }; |
| status = iree_hal_amdgpu_host_queue_try_submit_barrier( |
| queue, &submission.resolution, signal_semaphore_list, |
| (iree_hal_amdgpu_reclaim_action_t){0}, |
| /*operation_resources=*/NULL, |
| /*operation_resource_count=*/0, &profile_event_info, |
| iree_hal_amdgpu_host_queue_post_commit_callback_null(), |
| /*resource_set=*/NULL, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready, &submission_id); |
| if (iree_status_is_ok(status) && submission.ready) { |
| profile_event_info.submission_id = submission_id; |
| iree_hal_amdgpu_host_queue_record_profile_queue_event( |
| queue, &submission.resolution, signal_semaphore_list, |
| &profile_event_info); |
| } |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_execute( |
| queue, &wait_semaphore_list, &signal_semaphore_list, |
| /*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(), |
| flags, &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity( |
| &submission); |
| } |
| } |
| } else { |
| iree_hal_resource_set_t* binding_resource_set = NULL; |
| status = iree_hal_amdgpu_host_queue_submit_command_buffer( |
| queue, &submission.resolution, signal_semaphore_list, command_buffer, |
| binding_table, flags, &binding_resource_set, &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| iree_hal_resource_set_free(binding_resource_set); |
| status = iree_hal_amdgpu_host_queue_defer_execute( |
| queue, &wait_semaphore_list, &signal_semaphore_list, command_buffer, |
| binding_table, flags, &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } else if (!iree_status_is_ok(status)) { |
| iree_hal_resource_set_free(binding_resource_set); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
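// Queue alloca entry point. The transient buffer wrapper is prepared up front
// and the allocation is then submitted directly, parked behind a pool
// memory-wait op, or deferred behind unsatisfied waits / queue capacity.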
| static iree_status_t iree_hal_amdgpu_host_queue_alloca( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_pool_t* pool, iree_hal_buffer_params_t params, |
| iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags, |
| iree_hal_buffer_t** IREE_RESTRICT out_buffer) { |
| IREE_ASSERT_ARGUMENT(out_buffer); |
| *out_buffer = NULL; |
| |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| |
| iree_hal_pool_t* allocation_pool = NULL; |
| iree_hal_buffer_t* buffer = NULL; |
| IREE_RETURN_IF_ERROR(iree_hal_amdgpu_host_queue_prepare_alloca_wrapper( |
| queue, pool, ¶ms, allocation_size, flags, &allocation_pool, &buffer)); |
| // Always ask the pool to surface waitable death-frontier candidates so the |
| // queue can distinguish true pool pressure from a dependency the caller did |
| // not authorize. The HAL alloca flag is checked before consuming any |
| // OK_NEEDS_WAIT reservation. Disallow growth while submission_mutex is held; |
| // growable pools report that as a cold retry instead of calling into their |
| // slab provider on the serialized queue path. |
| const iree_hal_pool_reserve_flags_t reserve_flags = |
| IREE_HAL_POOL_RESERVE_FLAG_ALLOW_WAIT_FRONTIER | |
| IREE_HAL_POOL_RESERVE_FLAG_DISALLOW_GROWTH; |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| iree_hal_amdgpu_pending_op_t* memory_wait_op = NULL; |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_alloca( |
| queue, &wait_semaphore_list, &signal_semaphore_list, allocation_pool, |
| params, allocation_size, flags, reserve_flags, buffer, |
| &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_alloca( |
| queue, &submission.resolution, signal_semaphore_list, allocation_pool, |
| params, allocation_size, flags, reserve_flags, buffer, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| /*pending_op=*/NULL, &memory_wait_op, &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready && !memory_wait_op) { |
| status = iree_hal_amdgpu_host_queue_defer_alloca( |
| queue, &wait_semaphore_list, &signal_semaphore_list, allocation_pool, |
| params, allocation_size, flags, reserve_flags, buffer, |
| &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| status = iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| if (iree_status_is_ok(status) && memory_wait_op) { |
| iree_hal_amdgpu_pending_op_enqueue_alloca_memory_wait(memory_wait_op); |
| } |
| |
| if (iree_status_is_ok(status)) { |
| *out_buffer = buffer; |
| } else { |
| iree_hal_buffer_release(buffer); |
| } |
| return status; |
| } |
| |
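// Queue dealloca entry point. Non-transient buffers degrade to a barrier-only
// execute so the semaphore chain is still honored; transient buffers are
// marked for deallocation and then submitted or deferred like other queue
// operations.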
| static iree_status_t iree_hal_amdgpu_host_queue_dealloca( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_buffer_t* buffer, iree_hal_dealloca_flags_t flags) { |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| |
| if (IREE_UNLIKELY( |
| iree_any_bit_set(flags, ~(IREE_HAL_DEALLOCA_FLAG_NONE | |
| IREE_HAL_DEALLOCA_FLAG_PREFER_ORIGIN)))) { |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "unsupported dealloca flags: 0x%" PRIx64, flags); |
| } |
| |
| // iree_hal_device_queue_dealloca() applies PREFER_ORIGIN before vtable |
| // dispatch by rewriting the device and queue affinity from the buffer's |
| // allocation placement. Transient wrappers created by queue_alloca carry this |
| // queue's one-bit affinity in that placement, so this host-queue path can use |
| // |base_queue| directly. |
| if (!iree_hal_amdgpu_transient_buffer_isa(buffer)) { |
| return iree_hal_amdgpu_host_queue_execute( |
| base_queue, wait_semaphore_list, signal_semaphore_list, |
| /*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(), |
| IREE_HAL_EXECUTE_FLAG_NONE); |
| } |
| |
| if (IREE_UNLIKELY(!iree_hal_amdgpu_transient_buffer_begin_dealloca(buffer))) { |
| return iree_make_status( |
| IREE_STATUS_FAILED_PRECONDITION, |
| "transient buffer has already been queued for deallocation"); |
| } |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_dealloca( |
| queue, &wait_semaphore_list, &signal_semaphore_list, buffer, |
| &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_dealloca( |
| queue, &submission.resolution, signal_semaphore_list, buffer, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_dealloca( |
| queue, &wait_semaphore_list, &signal_semaphore_list, buffer, |
| &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| status = iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| if (!iree_status_is_ok(status)) { |
| iree_hal_amdgpu_transient_buffer_abort_dealloca(buffer); |
| } |
| return status; |
| } |
| |
| // Queue fill entry point. Resolves waits under submission_mutex and captures a |
| // pending operation only when waits or submission capacity require deferral. |
| static iree_status_t iree_hal_amdgpu_host_queue_fill( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, |
| iree_device_size_t length, uint64_t pattern_bits, |
| iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) { |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_fill( |
| queue, &wait_semaphore_list, &signal_semaphore_list, target_buffer, |
| target_offset, length, pattern_bits, pattern_length, flags, |
| &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_fill( |
| queue, &submission.resolution, signal_semaphore_list, target_buffer, |
| target_offset, length, pattern_bits, pattern_length, flags, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_fill( |
| queue, &wait_semaphore_list, &signal_semaphore_list, target_buffer, |
| target_offset, length, pattern_bits, pattern_length, flags, |
| &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
| iree_status_t iree_hal_amdgpu_host_queue_copy_buffer( |
| iree_hal_amdgpu_host_queue_t* queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset, |
| iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, |
| iree_device_size_t length, iree_hal_copy_flags_t flags, |
| iree_hal_profile_queue_event_type_t profile_event_type) { |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_copy( |
| queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer, |
| source_offset, target_buffer, target_offset, length, flags, |
| profile_event_type, &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_copy( |
| queue, &submission.resolution, signal_semaphore_list, source_buffer, |
| source_offset, target_buffer, target_offset, length, flags, |
| profile_event_type, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_copy( |
| queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer, |
| source_offset, target_buffer, target_offset, length, flags, |
| profile_event_type, &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
| // Queue copy entry point. The shared copy path is also used by file read/write |
| // staging so all copy-shaped operations use the same wait/backpressure path. |
| static iree_status_t iree_hal_amdgpu_host_queue_copy( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset, |
| iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, |
| iree_device_size_t length, iree_hal_copy_flags_t flags) { |
| return iree_hal_amdgpu_host_queue_copy_buffer( |
| (iree_hal_amdgpu_host_queue_t*)base_queue, wait_semaphore_list, |
| signal_semaphore_list, source_buffer, source_offset, target_buffer, |
| target_offset, length, flags, IREE_HAL_PROFILE_QUEUE_EVENT_TYPE_COPY); |
| } |
| |
| // Queue update entry point. Immediate updates copy into queue-owned kernarg |
| // memory; deferred updates copy into the pending-op arena. |
| static iree_status_t iree_hal_amdgpu_host_queue_update( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| const void* source_buffer, iree_host_size_t source_offset, |
| iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, |
| iree_device_size_t length, iree_hal_update_flags_t flags) { |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_update( |
| queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer, |
| source_offset, target_buffer, target_offset, length, flags, |
| &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_update( |
| queue, &submission.resolution, signal_semaphore_list, source_buffer, |
| source_offset, target_buffer, target_offset, length, flags, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_update( |
| queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer, |
| source_offset, target_buffer, target_offset, length, flags, |
| &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
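// Returns true for a direct dispatch that has no indirect parameters and an
// all-zero workgroup count and can therefore be elided into a barrier.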
| static bool iree_hal_amdgpu_host_queue_is_noop_dispatch( |
| const iree_hal_dispatch_config_t config, iree_hal_dispatch_flags_t flags) { |
| return !iree_hal_dispatch_uses_indirect_parameters(flags) && |
| (config.workgroup_count[0] | config.workgroup_count[1] | |
| config.workgroup_count[2]) == 0; |
| } |
| |
| // Queue dispatch entry point. Empty direct dispatches route through the barrier |
| // path so they still signal semaphores and profile as dispatch submissions. |
| static iree_status_t iree_hal_amdgpu_host_queue_dispatch( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_executable_t* executable, |
| iree_hal_executable_export_ordinal_t export_ordinal, |
| const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants, |
| const iree_hal_buffer_ref_list_t bindings, |
| iree_hal_dispatch_flags_t flags) { |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| const bool is_noop_dispatch = |
| iree_hal_amdgpu_host_queue_is_noop_dispatch(config, flags); |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| if (is_noop_dispatch) { |
| status = iree_hal_amdgpu_host_queue_defer_execute( |
| queue, &wait_semaphore_list, &signal_semaphore_list, |
| /*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(), |
| IREE_HAL_EXECUTE_FLAG_NONE, &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_defer_dispatch( |
| queue, &wait_semaphore_list, &signal_semaphore_list, executable, |
| export_ordinal, config, constants, bindings, flags, |
| &submission.deferred_op); |
| } |
| } else if (is_noop_dispatch) { |
| uint64_t submission_id = 0; |
| iree_hal_amdgpu_host_queue_profile_event_info_t profile_event_info = { |
| .type = IREE_HAL_PROFILE_QUEUE_EVENT_TYPE_DISPATCH, |
| .operation_count = 0, |
| }; |
| status = iree_hal_amdgpu_host_queue_try_submit_barrier( |
| queue, &submission.resolution, signal_semaphore_list, |
| (iree_hal_amdgpu_reclaim_action_t){0}, |
| /*operation_resources=*/NULL, |
| /*operation_resource_count=*/0, &profile_event_info, |
| iree_hal_amdgpu_host_queue_post_commit_callback_null(), |
| /*resource_set=*/NULL, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready, &submission_id); |
| if (iree_status_is_ok(status) && submission.ready) { |
| profile_event_info.submission_id = submission_id; |
| iree_hal_amdgpu_host_queue_record_profile_queue_event( |
| queue, &submission.resolution, signal_semaphore_list, |
| &profile_event_info); |
| } |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_execute( |
| queue, &wait_semaphore_list, &signal_semaphore_list, |
| /*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(), |
| IREE_HAL_EXECUTE_FLAG_NONE, &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_dispatch( |
| queue, &submission.resolution, signal_semaphore_list, executable, |
| export_ordinal, config, constants, bindings, flags, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_dispatch( |
| queue, &wait_semaphore_list, &signal_semaphore_list, executable, |
| export_ordinal, config, constants, bindings, flags, |
| &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
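// Queue read entry point. Routes to the shared file read path, which stages
// through the common copy submission machinery.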
| static iree_status_t iree_hal_amdgpu_host_queue_read( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_file_t* source_file, uint64_t source_offset, |
| iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset, |
| iree_device_size_t length, iree_hal_read_flags_t flags) { |
| return iree_hal_amdgpu_host_queue_read_file( |
| base_queue, wait_semaphore_list, signal_semaphore_list, source_file, |
| source_offset, target_buffer, target_offset, length, flags); |
| } |
| |
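// Queue write entry point. Routes to the shared file write path.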
| static iree_status_t iree_hal_amdgpu_host_queue_write( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset, |
| iree_hal_file_t* target_file, uint64_t target_offset, |
| iree_device_size_t length, iree_hal_write_flags_t flags) { |
| return iree_hal_amdgpu_host_queue_write_file( |
| base_queue, wait_semaphore_list, signal_semaphore_list, source_buffer, |
| source_offset, target_file, target_offset, length, flags); |
| } |
| |
| iree_status_t iree_hal_amdgpu_host_queue_enqueue_host_action( |
| iree_hal_amdgpu_host_queue_t* queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| iree_hal_amdgpu_reclaim_action_t action, |
| iree_hal_resource_t* const* operation_resources, |
| iree_host_size_t operation_resource_count) { |
| if (IREE_UNLIKELY(!action.fn)) { |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "host action callback must be non-null"); |
| } |
| if (IREE_UNLIKELY(operation_resource_count > 0 && !operation_resources)) { |
| return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, |
| "host action resources must be non-null"); |
| } |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| // Host actions execute on CPU threads and must observe device-produced |
| // host-visible memory even when a semaphore edge itself is device-local. |
| submission.resolution.inline_acquire_scope = |
| iree_hal_amdgpu_host_queue_max_fence_scope( |
| submission.resolution.inline_acquire_scope, |
| IREE_HSA_FENCE_SCOPE_SYSTEM); |
| submission.resolution.barrier_acquire_scope = |
| iree_hal_amdgpu_host_queue_max_fence_scope( |
| submission.resolution.barrier_acquire_scope, |
| IREE_HSA_FENCE_SCOPE_SYSTEM); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_host_action( |
| queue, &wait_semaphore_list, action, operation_resources, |
| operation_resource_count, &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_try_submit_barrier( |
| queue, &submission.resolution, iree_hal_semaphore_list_empty(), action, |
| operation_resources, operation_resource_count, |
| /*profile_event_info=*/NULL, |
| iree_hal_amdgpu_host_queue_post_commit_callback_null(), |
| /*resource_set=*/NULL, |
| IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES, |
| &submission.ready, /*out_submission_id=*/NULL); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_host_action( |
| queue, &wait_semaphore_list, action, operation_resources, |
| operation_resource_count, &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
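// Queue host call entry point. Validates the call up front and then follows
// the same direct-submit / defer pattern as the other queue operations.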
| static iree_status_t iree_hal_amdgpu_host_queue_host_call( |
| iree_hal_amdgpu_virtual_queue_t* base_queue, |
| const iree_hal_semaphore_list_t wait_semaphore_list, |
| const iree_hal_semaphore_list_t signal_semaphore_list, |
| iree_hal_host_call_t call, const uint64_t args[4], |
| iree_hal_host_call_flags_t flags) { |
| IREE_RETURN_IF_ERROR( |
| iree_hal_amdgpu_host_queue_validate_host_call(call, args, flags)); |
| |
| iree_hal_amdgpu_host_queue_t* queue = |
| (iree_hal_amdgpu_host_queue_t*)base_queue; |
| |
| iree_hal_amdgpu_host_queue_op_submission_t submission; |
| iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list, |
| &submission); |
| iree_status_t status = iree_ok_status(); |
| if (submission.resolution.needs_deferral) { |
| status = iree_hal_amdgpu_host_queue_defer_host_call( |
| queue, &wait_semaphore_list, &signal_semaphore_list, call, args, flags, |
| &submission.deferred_op); |
| } else { |
| status = iree_hal_amdgpu_host_queue_submit_host_call( |
| queue, &submission.resolution, signal_semaphore_list, call, args, flags, |
| &submission.ready); |
| if (iree_status_is_ok(status) && !submission.ready) { |
| status = iree_hal_amdgpu_host_queue_defer_host_call( |
| queue, &wait_semaphore_list, &signal_semaphore_list, call, args, |
| flags, &submission.deferred_op); |
| iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission); |
| } |
| } |
| return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status); |
| } |
| |
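// No-op: operations are either committed to the hardware queue at submission
// time or parked as pending ops driven by semaphore and completion events, so
// there is nothing to flush.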
| static iree_status_t iree_hal_amdgpu_host_queue_flush( |
| iree_hal_amdgpu_virtual_queue_t* base_queue) { |
| return iree_ok_status(); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // Virtual queue vtable |
| //===----------------------------------------------------------------------===// |
| |
| static void iree_hal_amdgpu_host_queue_deinitialize_vtable( |
| iree_hal_amdgpu_virtual_queue_t* base_queue) { |
| iree_hal_amdgpu_host_queue_deinitialize( |
| (iree_hal_amdgpu_host_queue_t*)base_queue); |
| } |
| |
| static const iree_hal_amdgpu_virtual_queue_vtable_t |
| iree_hal_amdgpu_host_queue_vtable = { |
| .deinitialize = iree_hal_amdgpu_host_queue_deinitialize_vtable, |
| .trim = iree_hal_amdgpu_host_queue_trim, |
| .alloca = iree_hal_amdgpu_host_queue_alloca, |
| .dealloca = iree_hal_amdgpu_host_queue_dealloca, |
| .fill = iree_hal_amdgpu_host_queue_fill, |
| .update = iree_hal_amdgpu_host_queue_update, |
| .copy = iree_hal_amdgpu_host_queue_copy, |
| .read = iree_hal_amdgpu_host_queue_read, |
| .write = iree_hal_amdgpu_host_queue_write, |
| .host_call = iree_hal_amdgpu_host_queue_host_call, |
| .dispatch = iree_hal_amdgpu_host_queue_dispatch, |
| .execute = iree_hal_amdgpu_host_queue_execute, |
| .flush = iree_hal_amdgpu_host_queue_flush, |
| }; |