// Copyright 2026 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/amdgpu/host_queue.h"
#include <stdio.h>
#include <string.h>
#include "iree/async/frontier_tracker.h"
#include "iree/async/notification.h"
#include "iree/base/threading/thread.h"
#include "iree/hal/drivers/amdgpu/host_queue_blit.h"
#include "iree/hal/drivers/amdgpu/host_queue_command_buffer.h"
#include "iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h"
#include "iree/hal/drivers/amdgpu/host_queue_dispatch.h"
#include "iree/hal/drivers/amdgpu/host_queue_file.h"
#include "iree/hal/drivers/amdgpu/host_queue_host_call.h"
#include "iree/hal/drivers/amdgpu/host_queue_memory.h"
#include "iree/hal/drivers/amdgpu/host_queue_pending.h"
#include "iree/hal/drivers/amdgpu/host_queue_policy.h"
#include "iree/hal/drivers/amdgpu/host_queue_profile.h"
#include "iree/hal/drivers/amdgpu/host_queue_profile_events.h"
#include "iree/hal/drivers/amdgpu/host_queue_submission.h"
#include "iree/hal/drivers/amdgpu/host_queue_waits.h"
#include "iree/hal/drivers/amdgpu/semaphore.h"
#include "iree/hal/drivers/amdgpu/transient_buffer.h"
#include "iree/hal/drivers/amdgpu/util/pm4_emitter.h"
#include "iree/hal/utils/resource_set.h"
static const iree_hal_amdgpu_virtual_queue_vtable_t
iree_hal_amdgpu_host_queue_vtable;
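
// Allocates one PM4 indirect-buffer (IB) slot per AQL packet slot from
// |pm4_ib_pool|, grants |gpu_agent| access to the allocation, and zeroes it.
// Slots are indexed by AQL packet id and so inherit the AQL ring's
// backpressure and reuse; ownership transfers to |out_queue|.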
static iree_status_t iree_hal_amdgpu_host_queue_allocate_pm4_ib_slots(
const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t gpu_agent,
hsa_amd_memory_pool_t pm4_ib_pool, uint32_t aql_queue_capacity,
iree_hal_amdgpu_host_queue_t* out_queue) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, aql_queue_capacity);
iree_host_size_t pm4_ib_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &pm4_ib_size,
IREE_STRUCT_FIELD(aql_queue_capacity,
iree_hal_amdgpu_pm4_ib_slot_t, NULL)));
if (IREE_UNLIKELY(!pm4_ib_pool.handle)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"PM4 IB memory pool is required"));
}
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, pm4_ib_size);
iree_hal_amdgpu_pm4_ib_slot_t* pm4_ib_slots = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hsa_amd_memory_pool_allocate(
IREE_LIBHSA(libhsa), pm4_ib_pool, pm4_ib_size,
HSA_AMD_MEMORY_POOL_EXECUTABLE_FLAG, (void**)&pm4_ib_slots));
iree_status_t status = iree_hsa_amd_agents_allow_access(
IREE_LIBHSA(libhsa), /*num_agents=*/1, &gpu_agent, /*flags=*/NULL,
pm4_ib_slots);
if (iree_status_is_ok(status)) {
memset(pm4_ib_slots, 0, pm4_ib_size);
out_queue->pm4_ib_slots = pm4_ib_slots;
} else {
status = iree_status_join(status, iree_hsa_amd_memory_pool_free(
IREE_LIBHSA(libhsa), pm4_ib_slots));
}
IREE_TRACE_ZONE_END(z0);
return status;
}
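
// Notification ring drain callback invoked once per retired entry. Returns the
// entry's profile dispatch and queue-device event reservations to their rings
// so that storage can be reused by later submissions.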
static void iree_hal_amdgpu_host_queue_reclaim_retired(
iree_hal_amdgpu_reclaim_entry_t* entry, uint64_t epoch, void* user_data) {
(void)epoch;
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)user_data;
iree_hal_amdgpu_profile_dispatch_event_reservation_t reservation = {
.first_event_position = entry->profile_event_first_position,
.event_count = entry->profile_event_count,
};
iree_hal_amdgpu_host_queue_retire_profile_dispatch_events(queue, reservation);
iree_hal_amdgpu_profile_queue_device_event_reservation_t
queue_device_reservation = {
.first_event_position = entry->queue_device_event_first_position,
.event_count = entry->queue_device_event_count,
};
iree_hal_amdgpu_host_queue_retire_profile_queue_device_events(
queue, queue_device_reservation);
}

//===----------------------------------------------------------------------===//
// Completion processing
//===----------------------------------------------------------------------===//
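
// Enqueues |action| to run after the next completion drain. |action| storage
// is caller-owned and must remain valid until |fn| has executed; actions run
// in FIFO order on whichever thread performs the drain, outside of any locks.
//
// Usage sketch (|my_retry_fn| and |my_op| are illustrative, not real helpers):
//   iree_hal_amdgpu_host_queue_enqueue_post_drain_action(
//       queue, &my_op->post_drain_action, my_retry_fn, my_op);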
void iree_hal_amdgpu_host_queue_enqueue_post_drain_action(
iree_hal_amdgpu_host_queue_t* queue,
iree_hal_amdgpu_host_queue_post_drain_action_t* action,
iree_hal_amdgpu_host_queue_post_drain_fn_t fn, void* user_data) {
action->next = NULL;
action->fn = fn;
action->user_data = user_data;
iree_slim_mutex_lock(&queue->locks.post_drain_mutex);
if (queue->post_drain.tail) {
queue->post_drain.tail->next = action;
} else {
queue->post_drain.head = action;
}
queue->post_drain.tail = action;
iree_slim_mutex_unlock(&queue->locks.post_drain_mutex);
}
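
// Takes the queued post-drain actions under the lock and runs them outside of
// it so callbacks are free to enqueue new work (including new post-drain
// actions) without deadlocking.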
static void iree_hal_amdgpu_host_queue_run_post_drain_actions(
iree_hal_amdgpu_host_queue_t* queue) {
iree_slim_mutex_lock(&queue->locks.post_drain_mutex);
iree_hal_amdgpu_host_queue_post_drain_action_t* action =
queue->post_drain.head;
queue->post_drain.head = NULL;
queue->post_drain.tail = NULL;
iree_slim_mutex_unlock(&queue->locks.post_drain_mutex);
while (action) {
iree_hal_amdgpu_host_queue_post_drain_action_t* next_action = action->next;
action->next = NULL;
action->fn(action->user_data);
action = next_action;
}
}
// Drains completed notification entries and reclaims kernarg space. If the GPU
// queue has faulted (error_status is set), fails all pending entries instead of
// draining normally.
static iree_host_size_t iree_hal_amdgpu_host_queue_drain_completions(
iree_hal_amdgpu_host_queue_t* queue) {
// Check for GPU queue error (set by the HSA error callback on another
// thread). If the queue has faulted, no further epochs will advance;
// fail all pending entries so waiters get the actual GPU error instead
// of hanging or timing out.
iree_status_t error = (iree_status_t)iree_atomic_load(
&queue->error_status, iree_memory_order_acquire);
const uint64_t previous_epoch = (uint64_t)iree_atomic_load(
&queue->notification_ring.epoch.last_drained, iree_memory_order_relaxed);
uint64_t kernarg_reclaim_position = 0;
iree_host_size_t count = 0;
if (IREE_UNLIKELY(error)) {
count = iree_hal_amdgpu_notification_ring_fail_all(
&queue->notification_ring, error, &kernarg_reclaim_position);
iree_hal_amdgpu_host_queue_clear_profile_events(queue);
iree_async_frontier_tracker_fail_axis(
queue->frontier_tracker, queue->axis,
iree_status_from_code(iree_status_code(error)));
} else {
count = iree_hal_amdgpu_notification_ring_drain(
&queue->notification_ring,
/*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired,
queue, &kernarg_reclaim_position);
const uint64_t current_epoch =
(uint64_t)iree_atomic_load(&queue->notification_ring.epoch.last_drained,
iree_memory_order_acquire);
if (current_epoch > previous_epoch) {
iree_async_frontier_tracker_advance(queue->frontier_tracker, queue->axis,
current_epoch);
}
}
if (kernarg_reclaim_position > 0) {
iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring,
kernarg_reclaim_position);
}
iree_hal_amdgpu_host_queue_run_post_drain_actions(queue);
return count;
}
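
// Returns true if a sticky queue error has been recorded (by the HSA error
// callback or another failure path).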
static bool iree_hal_amdgpu_host_queue_has_error(
iree_hal_amdgpu_host_queue_t* queue) {
return iree_atomic_load(&queue->error_status, iree_memory_order_acquire) != 0;
}
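
// Stores |error| as the sticky queue error if none has been recorded yet.
// Takes ownership of |error|: returns true if this call won the race (the
// status is now owned by the queue) and otherwise frees |error| and returns
// false.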
static bool iree_hal_amdgpu_host_queue_store_error(
iree_hal_amdgpu_host_queue_t* queue, iree_status_t error) {
intptr_t expected = 0;
if (iree_atomic_compare_exchange_strong(
&queue->error_status, &expected, (intptr_t)error,
iree_memory_order_release, iree_memory_order_acquire)) {
return true;
}
iree_status_free(error);
return false;
}
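
// Requests that the completion thread exit by raising the host-only stop
// signal. Safe to call from the HSA error callback thread and a no-op if the
// signal has not been created yet.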
static void iree_hal_amdgpu_host_queue_request_completion_thread_stop(
iree_hal_amdgpu_host_queue_t* queue) {
if (queue->completion.stop_signal.handle) {
iree_hsa_signal_store_screlease(IREE_LIBHSA(queue->libhsa),
queue->completion.stop_signal, 1);
}
}
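
// Maps the last drained epoch to its epoch signal value. The epoch signal
// counts down from IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE as epochs advance, so
// the completion thread re-arms its HSA_SIGNAL_CONDITION_NE wait with this
// value to wake on the next change.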
static hsa_signal_value_t iree_hal_amdgpu_host_queue_last_drained_signal_value(
iree_hal_amdgpu_host_queue_t* queue) {
const uint64_t last_drained_epoch = (uint64_t)iree_atomic_load(
&queue->notification_ring.epoch.last_drained, iree_memory_order_acquire);
return (hsa_signal_value_t)(IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE -
last_drained_epoch);
}
// Completion thread entry point. Blocks in HSA until either the queue epoch
// signal changes or teardown/error signals the stop signal. Completion wakeups
// drain normally; stop/error wakeups perform one final drain/fail before exit.
static int iree_hal_amdgpu_host_queue_completion_thread_main(void* entry_arg) {
{
IREE_TRACE_ZONE_BEGIN_NAMED(
z0, "iree_hal_amdgpu_host_queue_completion_thread_start");
IREE_TRACE_ZONE_END(z0);
}
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)entry_arg;
enum {
IREE_HAL_AMDGPU_COMPLETION_WAIT_EPOCH_SIGNAL = 0,
IREE_HAL_AMDGPU_COMPLETION_WAIT_STOP_SIGNAL = 1,
IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT = 2,
};
hsa_signal_t epoch_signal =
iree_hal_amdgpu_notification_ring_epoch_signal(&queue->notification_ring);
hsa_signal_t stop_signal = queue->completion.stop_signal;
hsa_signal_value_t last_epoch_value =
iree_hal_amdgpu_host_queue_last_drained_signal_value(queue);
bool keep_running = true;
while (keep_running) {
hsa_signal_t signals[IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT] = {
epoch_signal,
stop_signal,
};
hsa_signal_condition_t
conditions[IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT] = {
HSA_SIGNAL_CONDITION_NE,
HSA_SIGNAL_CONDITION_NE,
};
hsa_signal_value_t values[IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT] = {
last_epoch_value,
0,
};
const uint32_t signal_index = iree_hsa_amd_signal_wait_any(
IREE_LIBHSA(queue->libhsa),
IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT, signals, conditions,
values, UINT64_MAX, HSA_WAIT_STATE_BLOCKED,
/*satisfying_value=*/NULL);
{
IREE_TRACE_ZONE_BEGIN_NAMED(
z0, "iree_hal_amdgpu_host_queue_completion_thread_pump");
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, signal_index);
if (signal_index == IREE_HAL_AMDGPU_COMPLETION_WAIT_EPOCH_SIGNAL) {
iree_hal_amdgpu_host_queue_drain_completions(queue);
// Arm the next wait from the epoch we actually drained, not from a raw
// HSA signal load. A GPU completion can race with the drain and update
// the signal after drain() sampled it; observing that newer value here
// would mark an undrained epoch as already seen and could sleep forever
// with a user semaphore still pending.
last_epoch_value =
iree_hal_amdgpu_host_queue_last_drained_signal_value(queue);
}
if (signal_index == IREE_HAL_AMDGPU_COMPLETION_WAIT_STOP_SIGNAL ||
iree_hal_amdgpu_host_queue_has_error(queue)) {
iree_hal_amdgpu_host_queue_drain_completions(queue);
keep_running = false;
} else if (IREE_UNLIKELY(signal_index >=
IREE_HAL_AMDGPU_COMPLETION_WAIT_SIGNAL_COUNT)) {
iree_status_t error = iree_make_status(
IREE_STATUS_INTERNAL,
"hsa_amd_signal_wait_any returned invalid signal index %u",
signal_index);
iree_hal_amdgpu_host_queue_store_error(queue, error);
iree_hal_amdgpu_host_queue_drain_completions(queue);
keep_running = false;
}
IREE_TRACE_ZONE_END(z0);
}
}
{
IREE_TRACE_ZONE_BEGIN_NAMED(
z0, "iree_hal_amdgpu_host_queue_completion_thread_exit");
IREE_TRACE_ZONE_END(z0);
}
return 0;
}
// HSA queue error callback. Called by the HSA runtime (on an internal thread)
// when the queue encounters an unrecoverable error (page fault, invalid AQL
// packet, ECC error). Stores the error atomically on the queue so the
// completion thread can fail pending semaphores with the actual GPU error.
static void iree_hal_amdgpu_host_queue_error_callback(hsa_status_t status,
hsa_queue_t* source,
void* data) {
iree_hal_amdgpu_host_queue_t* queue = (iree_hal_amdgpu_host_queue_t*)data;
// Convert the HSA error to an IREE status with diagnostic information.
iree_status_t error = iree_status_from_hsa_status(
__FILE__, __LINE__, status, "hsa_queue_error_callback",
"GPU queue encountered an unrecoverable error");
// First-error-wins: store the error with release semantics so the status
// payload (heap-allocated string, backtrace) is visible to any thread that
// loads with acquire. If another error already won the race, free ours.
if (iree_hal_amdgpu_host_queue_store_error(queue, error)) {
iree_hal_amdgpu_host_queue_request_completion_thread_stop(queue);
}
}
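
//===----------------------------------------------------------------------===//
// Initialization / deinitialization
//===----------------------------------------------------------------------===//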
iree_status_t iree_hal_amdgpu_host_queue_initialize(
const iree_hal_amdgpu_libhsa_t* libhsa, iree_hal_device_t* logical_device,
iree_async_proactor_t* proactor, hsa_agent_t gpu_agent,
const iree_hal_amdgpu_kernarg_ring_memory_t* kernarg_memory,
hsa_amd_memory_pool_t pm4_ib_pool,
iree_async_frontier_tracker_t* frontier_tracker, iree_async_axis_t axis,
iree_hal_queue_affinity_t queue_affinity,
iree_thread_affinity_t completion_thread_affinity,
iree_hal_amdgpu_wait_barrier_strategy_t wait_barrier_strategy,
iree_hal_amdgpu_vendor_packet_capability_flags_t vendor_packet_capabilities,
iree_hal_amdgpu_epoch_signal_table_t* epoch_table,
iree_arena_block_pool_t* block_pool,
iree_hal_amdgpu_block_pool_t* profiling_signal_block_pool,
const iree_hal_amdgpu_device_buffer_transfer_context_t* transfer_context,
const iree_hal_pool_set_t* default_pool_set, iree_hal_pool_t* default_pool,
iree_hal_amdgpu_transient_buffer_pool_t* transient_buffer_pool,
iree_hal_amdgpu_staging_pool_t* staging_pool,
iree_host_size_t device_ordinal, uint32_t aql_queue_capacity,
uint32_t notification_capacity, uint32_t kernarg_capacity_in_blocks,
iree_allocator_t host_allocator, iree_hal_amdgpu_host_queue_t* out_queue) {
IREE_ASSERT_ARGUMENT(libhsa);
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(proactor);
IREE_ASSERT_ARGUMENT(kernarg_memory);
IREE_ASSERT_ARGUMENT(frontier_tracker);
IREE_ASSERT_ARGUMENT(epoch_table);
IREE_ASSERT_ARGUMENT(block_pool);
IREE_ASSERT_ARGUMENT(profiling_signal_block_pool);
IREE_ASSERT_ARGUMENT(transfer_context);
IREE_ASSERT_ARGUMENT(default_pool_set);
IREE_ASSERT_ARGUMENT(default_pool);
IREE_ASSERT_ARGUMENT(transient_buffer_pool);
IREE_ASSERT_ARGUMENT(out_queue);
if (!iree_host_size_is_power_of_two(aql_queue_capacity) ||
!iree_host_size_is_power_of_two(notification_capacity) ||
!iree_host_size_is_power_of_two(kernarg_capacity_in_blocks)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"all capacities must be powers of two");
}
if (kernarg_capacity_in_blocks / 2u < aql_queue_capacity) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"kernarg ring capacity must be at least 2x the AQL ring capacity "
"to cover one tail-padding gap at wrap (got kernarg_blocks=%u, "
"aql_packets=%u)",
kernarg_capacity_in_blocks, aql_queue_capacity);
}
IREE_TRACE_ZONE_BEGIN(z0);
memset(out_queue, 0, sizeof(*out_queue));
out_queue->base.vtable = &iree_hal_amdgpu_host_queue_vtable;
out_queue->libhsa = libhsa;
out_queue->logical_device = logical_device;
out_queue->proactor = proactor;
out_queue->frontier_tracker = frontier_tracker;
out_queue->host_allocator = host_allocator;
// Submission pipeline state.
iree_slim_mutex_initialize(&out_queue->locks.submission_mutex);
iree_slim_mutex_initialize(&out_queue->locks.post_drain_mutex);
iree_slim_mutex_initialize(&out_queue->profiling.event_mutex);
out_queue->profiling.signals.block_pool = profiling_signal_block_pool;
out_queue->axis = axis;
out_queue->wait_barrier_strategy = wait_barrier_strategy;
out_queue->vendor_packet_capabilities = vendor_packet_capabilities;
out_queue->queue_affinity = queue_affinity;
out_queue->last_signal.semaphore = NULL;
out_queue->last_signal.epoch = 0;
out_queue->block_pool = block_pool;
out_queue->can_publish_frontier = true;
out_queue->transfer_context = transfer_context;
out_queue->default_pool_set = default_pool_set;
out_queue->default_pool = default_pool;
out_queue->transient_buffer_pool = transient_buffer_pool;
out_queue->staging_pool = staging_pool;
out_queue->device_ordinal = device_ordinal;
out_queue->pending_head = NULL;
iree_async_frontier_initialize(iree_hal_amdgpu_host_queue_frontier(out_queue),
/*entry_count=*/0);
// The optional tracker semaphore is an iree_async_semaphore_t bridge for
// CPU-side wait integration. The queue's GPU-visible HSA epoch signal is
// created by the notification ring below and registered in the epoch table.
iree_status_t status = iree_async_frontier_tracker_register_axis(
frontier_tracker, axis, /*semaphore=*/NULL);
// Create the host-only stop signal before the hardware queue so the HSA error
// callback always has a valid signal to wake if queue creation races with an
// asynchronous fault.
if (iree_status_is_ok(status)) {
status = iree_hsa_amd_signal_create(
IREE_LIBHSA(libhsa), /*initial_value=*/0,
/*num_consumers=*/0, /*consumers=*/NULL, /*attributes=*/0,
&out_queue->completion.stop_signal);
}
// Create the HSA hardware AQL queue.
//
// HSA_QUEUE_TYPE_MULTI is required (not just an optimization). Once command
// buffers start performing device-side enqueue, the CP itself becomes a
// concurrent producer alongside the host submission path, so the queue must
// permit multiple concurrent producers. The host-side reserve already uses
// an atomic fetch_add on the write index, which is well-defined only on
// MULTI queues.
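//
// Reservation sketch (illustrative only; the real helper lives in the AQL
// ring code):
//   uint64_t packet_id =
//       hsa_queue_add_write_index_screlease(queue, 1);  // atomic fetch_add
//   while (packet_id - hsa_queue_load_read_index_scacquire(queue) >=
//          queue->size) {
//     // Backpressure: spin/yield until the CP consumes packets.
//   }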
hsa_queue_t* hardware_queue = NULL;
if (iree_status_is_ok(status)) {
status = iree_hsa_queue_create(
IREE_LIBHSA(libhsa), gpu_agent, aql_queue_capacity,
HSA_QUEUE_TYPE_MULTI, iree_hal_amdgpu_host_queue_error_callback,
/*data=*/out_queue,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &hardware_queue);
}
// Initialize the AQL ring from the hardware queue.
if (iree_status_is_ok(status)) {
out_queue->hardware_queue = hardware_queue;
iree_hal_amdgpu_aql_ring_initialize((iree_amd_queue_t*)hardware_queue,
&out_queue->aql_ring);
}
// Initialize the kernarg ring from the selected HSA memory pool.
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_kernarg_ring_initialize(libhsa, kernarg_memory,
kernarg_capacity_in_blocks,
&out_queue->kernarg_ring);
}
// Initialize the optional PM4 IB slot buffer. Capability-driven allocation
// keeps dynamic PM4 storage available on CDNA queues that use BARRIER_VALUE
// for waits but still support AQL PM4-IB snippets for other features. The
// buffer is indexed by AQL packet id and inherits AQL ring
// backpressure/reuse; there is no separate PM4 producer or reclaim position.
if (iree_status_is_ok(status) &&
(vendor_packet_capabilities &
IREE_HAL_AMDGPU_VENDOR_PACKET_CAPABILITY_AQL_PM4_IB)) {
status = iree_hal_amdgpu_host_queue_allocate_pm4_ib_slots(
libhsa, gpu_agent, pm4_ib_pool, aql_queue_capacity, out_queue);
}
// Initialize the notification ring (creates epoch signal + entry buffer).
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_notification_ring_initialize(
libhsa, block_pool, notification_capacity, host_allocator,
&out_queue->notification_ring);
}
// Register this queue's epoch signal in the shared table for cross-queue
// barrier emission lookups. Must happen after notification ring init (which
// creates the epoch signal) and before any submissions.
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_epoch_signal_table_register(
epoch_table, iree_async_axis_device_index(axis),
iree_async_axis_queue_index(axis),
iree_hal_amdgpu_notification_ring_epoch_signal(
&out_queue->notification_ring));
out_queue->epoch_table = epoch_table;
}
if (iree_status_is_ok(status)) {
iree_thread_create_params_t thread_params;
memset(&thread_params, 0, sizeof(thread_params));
char thread_name[32] = {0};
snprintf(thread_name, IREE_ARRAYSIZE(thread_name),
"iree-hal-amdgpu-l0p%uq%u-complete",
(unsigned)iree_async_axis_device_index(axis),
(unsigned)iree_async_axis_queue_index(axis));
thread_params.name = iree_make_cstring_view(thread_name);
thread_params.initial_affinity = completion_thread_affinity;
status = iree_thread_create(
iree_hal_amdgpu_host_queue_completion_thread_main, out_queue,
thread_params, host_allocator, &out_queue->completion.thread);
}
if (!iree_status_is_ok(status)) {
iree_hal_amdgpu_host_queue_deinitialize(out_queue);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
void iree_hal_amdgpu_host_queue_deinitialize(
iree_hal_amdgpu_host_queue_t* queue) {
IREE_ASSERT_ARGUMENT(queue);
IREE_TRACE_ZONE_BEGIN(z0);
iree_slim_mutex_lock(&queue->locks.submission_mutex);
queue->is_shutting_down = true;
iree_slim_mutex_unlock(&queue->locks.submission_mutex);
if (queue->completion.thread) {
iree_hal_amdgpu_host_queue_request_completion_thread_stop(queue);
// There is only one owner for the thread, so this also joins the thread.
iree_thread_release(queue->completion.thread);
queue->completion.thread = NULL;
}
// Destroy the hardware queue before the remaining host-side resources so the
// HSA runtime cannot race a late error callback against signal teardown.
if (queue->hardware_queue) {
iree_hal_amdgpu_hsa_cleanup_assert_success(
iree_hsa_queue_destroy_raw(queue->libhsa, queue->hardware_queue));
queue->hardware_queue = NULL;
}
// Capacity-parked pending ops are retried by post-drain callbacks. Flush those
// callbacks first, while shutting down, so they observe cancellation and take
// their normal failure path instead of having their callback storage destroyed
// out from under them.
iree_hal_amdgpu_host_queue_run_post_drain_actions(queue);
// Cancel all pending (deferred) operations. Their signal semaphores are
// failed with CANCELLED so downstream waiters don't hang.
if (queue->pending_head) {
iree_hal_amdgpu_host_queue_cancel_pending(queue, IREE_STATUS_CANCELLED,
"queue shutting down");
}
// Process any remaining notification entries before destroying resources.
// If the GPU faulted, fail all pending entries so waiters get the actual
// error. Otherwise drain normally (entries completed but not yet processed).
iree_status_t error = (iree_status_t)iree_atomic_load(
&queue->error_status, iree_memory_order_acquire);
uint64_t kernarg_reclaim_position = 0;
if (!iree_status_is_ok(error)) {
iree_hal_amdgpu_notification_ring_fail_all(&queue->notification_ring, error,
&kernarg_reclaim_position);
iree_hal_amdgpu_host_queue_clear_profile_events(queue);
iree_status_free(error);
} else {
iree_hal_amdgpu_notification_ring_drain(
&queue->notification_ring,
/*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired,
queue, &kernarg_reclaim_position);
}
if (kernarg_reclaim_position > 0) {
iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring,
kernarg_reclaim_position);
}
iree_hal_amdgpu_host_queue_run_post_drain_actions(queue);
// Deregister from the epoch signal table before destroying the notification
// ring (which owns the epoch signal). Guarded by epoch_table != NULL to
// handle partial initialization (init failed before registration).
if (queue->epoch_table) {
iree_hal_amdgpu_epoch_signal_table_deregister(
queue->epoch_table, iree_async_axis_device_index(queue->axis),
iree_async_axis_queue_index(queue->axis));
queue->epoch_table = NULL;
}
if (queue->frontier_tracker) {
iree_async_frontier_tracker_retire_axis(
queue->frontier_tracker, queue->axis,
iree_status_from_code(IREE_STATUS_CANCELLED));
queue->frontier_tracker = NULL;
queue->axis = 0;
}
iree_hal_amdgpu_notification_ring_deinitialize(&queue->notification_ring);
iree_hal_amdgpu_kernarg_ring_deinitialize(queue->libhsa,
&queue->kernarg_ring);
if (queue->pm4_ib_slots) {
iree_hal_amdgpu_hsa_cleanup_assert_success(
iree_hsa_amd_memory_pool_free_raw(queue->libhsa, queue->pm4_ib_slots));
queue->pm4_ib_slots = NULL;
}
iree_hal_amdgpu_host_queue_deallocate_profiling_completion_signals(queue);
iree_hal_amdgpu_host_queue_deallocate_profile_events(queue);
if (queue->command_buffer_scratch) {
iree_allocator_free(queue->host_allocator, queue->command_buffer_scratch);
queue->command_buffer_scratch = NULL;
}
if (queue->completion.stop_signal.handle) {
iree_hal_amdgpu_hsa_cleanup_assert_success(iree_hsa_signal_destroy_raw(
queue->libhsa, queue->completion.stop_signal));
queue->completion.stop_signal.handle = 0;
}
iree_slim_mutex_deinitialize(&queue->locks.post_drain_mutex);
iree_slim_mutex_deinitialize(&queue->profiling.event_mutex);
iree_slim_mutex_deinitialize(&queue->locks.submission_mutex);
IREE_TRACE_ZONE_END(z0);
}
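
// Enables or disables HSA dispatch timestamping on the hardware queue. When
// enabling, profile event storage and profiling completion signals are
// ensured first and any stale recorded events are cleared.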
iree_status_t iree_hal_amdgpu_host_queue_set_hsa_profiling_enabled(
iree_hal_amdgpu_host_queue_t* queue, bool enabled) {
IREE_ASSERT_ARGUMENT(queue);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
if (enabled) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_host_queue_ensure_profile_event_storage(queue));
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_hal_amdgpu_host_queue_ensure_profiling_completion_signals(queue));
iree_hal_amdgpu_host_queue_clear_profile_events(queue);
}
iree_status_t status = iree_hsa_amd_profiling_set_profiler_enabled(
IREE_LIBHSA(queue->libhsa), queue->hardware_queue, enabled ? 1 : 0);
if (iree_status_is_ok(status)) {
queue->profiling.hsa_queue_timestamps_enabled = enabled ? 1 : 0;
}
IREE_TRACE_ZONE_END(z0);
return status;
}
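
// No-op: the host queue currently has nothing to trim.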
static void iree_hal_amdgpu_host_queue_trim(
iree_hal_amdgpu_virtual_queue_t* base_queue) {}
//===----------------------------------------------------------------------===//
// Queue operations
//===----------------------------------------------------------------------===//
typedef struct iree_hal_amdgpu_host_queue_op_submission_t {
// Queue whose submission_mutex is held between begin/end.
iree_hal_amdgpu_host_queue_t* queue;
// Wait resolution computed while holding submission_mutex.
iree_hal_amdgpu_wait_resolution_t resolution;
// Deferred operation captured while holding submission_mutex, if any.
iree_hal_amdgpu_pending_op_t* deferred_op;
// Number of input waits. Capacity retries only need post-drain resubmission
// when no semantic waits are available to naturally re-enter the queue.
iree_host_size_t wait_semaphore_count;
// Whether the direct submit helper found enough queue capacity.
bool ready;
// Whether |deferred_op| should retry on the completion thread after drain.
bool wait_for_capacity;
} iree_hal_amdgpu_host_queue_op_submission_t;
// Begins one direct/deferred queue operation attempt. The caller must pair this
// with iree_hal_amdgpu_host_queue_op_submission_end exactly once.
static inline void iree_hal_amdgpu_host_queue_op_submission_begin(
iree_hal_amdgpu_host_queue_t* queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
iree_hal_amdgpu_host_queue_op_submission_t* out_submission) {
out_submission->queue = queue;
out_submission->deferred_op = NULL;
out_submission->wait_semaphore_count = wait_semaphore_list.count;
out_submission->ready = true;
out_submission->wait_for_capacity = false;
iree_slim_mutex_lock(&queue->locks.submission_mutex);
iree_hal_amdgpu_host_queue_resolve_waits(queue, wait_semaphore_list,
&out_submission->resolution);
}
// Marks a captured pending op as retrying after completion-thread drain because
// direct submission ran out of queue capacity.
static inline void iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(
iree_hal_amdgpu_host_queue_op_submission_t* submission) {
submission->wait_for_capacity = submission->wait_semaphore_count == 0;
}
// Ends one direct/deferred queue operation attempt by releasing
// submission_mutex and starting any captured pending op outside the lock.
static inline iree_status_t iree_hal_amdgpu_host_queue_op_submission_end(
iree_hal_amdgpu_host_queue_op_submission_t* submission,
iree_status_t status) {
iree_slim_mutex_unlock(&submission->queue->locks.submission_mutex);
if (iree_status_is_ok(status) && submission->deferred_op) {
status = iree_hal_amdgpu_pending_op_start(submission->deferred_op,
submission->wait_for_capacity);
}
return status;
}
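
// Fast path for wait-free barrier-only submissions: checks for shutdown under
// submission_mutex and then signals |signal_semaphore_list| immediately
// without touching the hardware queue.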
static iree_status_t iree_hal_amdgpu_host_queue_signal_empty_barrier(
iree_hal_amdgpu_host_queue_t* queue,
const iree_hal_semaphore_list_t signal_semaphore_list) {
iree_slim_mutex_lock(&queue->locks.submission_mutex);
iree_status_t status = iree_ok_status();
if (IREE_UNLIKELY(queue->is_shutting_down)) {
status = iree_make_status(IREE_STATUS_CANCELLED, "queue shutting down");
}
iree_slim_mutex_unlock(&queue->locks.submission_mutex);
if (iree_status_is_ok(status)) {
// Signal outside submission_mutex: semaphore signaling dispatches satisfied
// timepoints, and those callbacks may submit additional queue work.
status = iree_hal_semaphore_list_signal(signal_semaphore_list,
/*frontier=*/NULL);
}
return status;
}
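
// Queue execute entry point. Wait-free barrier-only submissions signal
// immediately; other barrier-only submissions emit a barrier packet that also
// records a profile queue event; command buffers submit directly when queue
// capacity allows and defer otherwise.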
static iree_status_t iree_hal_amdgpu_host_queue_execute(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_command_buffer_t* command_buffer,
iree_hal_buffer_binding_table_t binding_table,
iree_hal_execute_flags_t flags) {
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_host_queue_validate_execute_flags(flags));
if (!command_buffer && wait_semaphore_list.count == 0) {
if (IREE_UNLIKELY(binding_table.count != 0)) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"barrier-only queue_execute must not provide a binding table "
"(count=%" PRIhsz ")",
binding_table.count);
}
return iree_hal_amdgpu_host_queue_signal_empty_barrier(
queue, signal_semaphore_list);
}
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_execute(
queue, &wait_semaphore_list, &signal_semaphore_list, command_buffer,
binding_table, flags, &submission.deferred_op);
} else if (!command_buffer) {
if (IREE_UNLIKELY(binding_table.count != 0)) {
status = iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"barrier-only queue_execute must not provide a binding table "
"(count=%" PRIhsz ")",
binding_table.count);
} else {
uint64_t submission_id = 0;
iree_hal_amdgpu_host_queue_profile_event_info_t profile_event_info = {
.type = IREE_HAL_PROFILE_QUEUE_EVENT_TYPE_BARRIER,
.operation_count = 0,
};
status = iree_hal_amdgpu_host_queue_try_submit_barrier(
queue, &submission.resolution, signal_semaphore_list,
(iree_hal_amdgpu_reclaim_action_t){0},
/*operation_resources=*/NULL,
/*operation_resource_count=*/0, &profile_event_info,
iree_hal_amdgpu_host_queue_post_commit_callback_null(),
/*resource_set=*/NULL,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready, &submission_id);
if (iree_status_is_ok(status) && submission.ready) {
profile_event_info.submission_id = submission_id;
iree_hal_amdgpu_host_queue_record_profile_queue_event(
queue, &submission.resolution, signal_semaphore_list,
&profile_event_info);
}
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_execute(
queue, &wait_semaphore_list, &signal_semaphore_list,
/*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(),
flags, &submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(
&submission);
}
}
} else {
iree_hal_resource_set_t* binding_resource_set = NULL;
status = iree_hal_amdgpu_host_queue_submit_command_buffer(
queue, &submission.resolution, signal_semaphore_list, command_buffer,
binding_table, flags, &binding_resource_set, &submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
iree_hal_resource_set_free(binding_resource_set);
status = iree_hal_amdgpu_host_queue_defer_execute(
queue, &wait_semaphore_list, &signal_semaphore_list, command_buffer,
binding_table, flags, &submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
} else if (!iree_status_is_ok(status)) {
iree_hal_resource_set_free(binding_resource_set);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
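
// Queue alloca entry point. The transient buffer wrapper is prepared before
// taking submission_mutex; the pool reservation and submission then run under
// the lock. Unresolved waits and exhausted queue capacity defer the op, while
// pool pressure parks it on a memory wait until space frees up.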
static iree_status_t iree_hal_amdgpu_host_queue_alloca(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_pool_t* pool, iree_hal_buffer_params_t params,
iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
IREE_ASSERT_ARGUMENT(out_buffer);
*out_buffer = NULL;
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
iree_hal_pool_t* allocation_pool = NULL;
iree_hal_buffer_t* buffer = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_host_queue_prepare_alloca_wrapper(
queue, pool, &params, allocation_size, flags, &allocation_pool, &buffer));
// Always ask the pool to surface waitable death-frontier candidates so the
// queue can distinguish true pool pressure from a dependency the caller did
// not authorize. The HAL alloca flag is checked before consuming any
// OK_NEEDS_WAIT reservation. Disallow growth while submission_mutex is held;
// growable pools report that as a cold retry instead of calling into their
// slab provider on the serialized queue path.
const iree_hal_pool_reserve_flags_t reserve_flags =
IREE_HAL_POOL_RESERVE_FLAG_ALLOW_WAIT_FRONTIER |
IREE_HAL_POOL_RESERVE_FLAG_DISALLOW_GROWTH;
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
iree_hal_amdgpu_pending_op_t* memory_wait_op = NULL;
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_alloca(
queue, &wait_semaphore_list, &signal_semaphore_list, allocation_pool,
params, allocation_size, flags, reserve_flags, buffer,
&submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_submit_alloca(
queue, &submission.resolution, signal_semaphore_list, allocation_pool,
params, allocation_size, flags, reserve_flags, buffer,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
/*pending_op=*/NULL, &memory_wait_op, &submission.ready);
if (iree_status_is_ok(status) && !submission.ready && !memory_wait_op) {
status = iree_hal_amdgpu_host_queue_defer_alloca(
queue, &wait_semaphore_list, &signal_semaphore_list, allocation_pool,
params, allocation_size, flags, reserve_flags, buffer,
&submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
status = iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
if (iree_status_is_ok(status) && memory_wait_op) {
iree_hal_amdgpu_pending_op_enqueue_alloca_memory_wait(memory_wait_op);
}
if (iree_status_is_ok(status)) {
*out_buffer = buffer;
} else {
iree_hal_buffer_release(buffer);
}
return status;
}
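
// Queue dealloca entry point. Non-transient buffers degrade to a barrier-only
// execute so semaphores still chain correctly; transient buffers are marked
// for deallocation exactly once and then submitted or deferred like any other
// queue op (aborting the mark if that fails).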
static iree_status_t iree_hal_amdgpu_host_queue_dealloca(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* buffer, iree_hal_dealloca_flags_t flags) {
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
if (IREE_UNLIKELY(
iree_any_bit_set(flags, ~(IREE_HAL_DEALLOCA_FLAG_NONE |
IREE_HAL_DEALLOCA_FLAG_PREFER_ORIGIN)))) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"unsupported dealloca flags: 0x%" PRIx64, flags);
}
// iree_hal_device_queue_dealloca() applies PREFER_ORIGIN before vtable
// dispatch by rewriting the device and queue affinity from the buffer's
// allocation placement. Transient wrappers created by queue_alloca carry this
// queue's one-bit affinity in that placement, so this host-queue path can use
// |base_queue| directly.
if (!iree_hal_amdgpu_transient_buffer_isa(buffer)) {
return iree_hal_amdgpu_host_queue_execute(
base_queue, wait_semaphore_list, signal_semaphore_list,
/*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(),
IREE_HAL_EXECUTE_FLAG_NONE);
}
if (IREE_UNLIKELY(!iree_hal_amdgpu_transient_buffer_begin_dealloca(buffer))) {
return iree_make_status(
IREE_STATUS_FAILED_PRECONDITION,
"transient buffer has already been queued for deallocation");
}
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_dealloca(
queue, &wait_semaphore_list, &signal_semaphore_list, buffer,
&submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_submit_dealloca(
queue, &submission.resolution, signal_semaphore_list, buffer,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_dealloca(
queue, &wait_semaphore_list, &signal_semaphore_list, buffer,
&submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
status = iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
if (!iree_status_is_ok(status)) {
iree_hal_amdgpu_transient_buffer_abort_dealloca(buffer);
}
return status;
}
// Queue fill entry point. Resolves waits under submission_mutex and captures a
// pending operation only when waits or submission capacity require deferral.
static iree_status_t iree_hal_amdgpu_host_queue_fill(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, uint64_t pattern_bits,
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_fill(
queue, &wait_semaphore_list, &signal_semaphore_list, target_buffer,
target_offset, length, pattern_bits, pattern_length, flags,
&submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_submit_fill(
queue, &submission.resolution, signal_semaphore_list, target_buffer,
target_offset, length, pattern_bits, pattern_length, flags,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_fill(
queue, &wait_semaphore_list, &signal_semaphore_list, target_buffer,
target_offset, length, pattern_bits, pattern_length, flags,
&submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
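
// Shared copy submission used by the queue copy entry point and the file
// read/write staging paths; |profile_event_type| identifies the originating
// operation in profile output.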
iree_status_t iree_hal_amdgpu_host_queue_copy_buffer(
iree_hal_amdgpu_host_queue_t* queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_copy_flags_t flags,
iree_hal_profile_queue_event_type_t profile_event_type) {
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_copy(
queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags,
profile_event_type, &submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_submit_copy(
queue, &submission.resolution, signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags,
profile_event_type,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_copy(
queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags,
profile_event_type, &submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
// Queue copy entry point. The shared copy path is also used by file read/write
// staging so all copy-shaped operations use the same wait/backpressure path.
static iree_status_t iree_hal_amdgpu_host_queue_copy(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_copy_flags_t flags) {
return iree_hal_amdgpu_host_queue_copy_buffer(
(iree_hal_amdgpu_host_queue_t*)base_queue, wait_semaphore_list,
signal_semaphore_list, source_buffer, source_offset, target_buffer,
target_offset, length, flags, IREE_HAL_PROFILE_QUEUE_EVENT_TYPE_COPY);
}
// Queue update entry point. Immediate updates copy into queue-owned kernarg
// memory; deferred updates copy into the pending-op arena.
static iree_status_t iree_hal_amdgpu_host_queue_update(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
const void* source_buffer, iree_host_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_update_flags_t flags) {
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_update(
queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags,
&submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_submit_update(
queue, &submission.resolution, signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_update(
queue, &wait_semaphore_list, &signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags,
&submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
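
// Returns true if a dispatch is statically empty: direct parameters with an
// all-zero workgroup count can never execute any workgroups.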
static bool iree_hal_amdgpu_host_queue_is_noop_dispatch(
const iree_hal_dispatch_config_t config, iree_hal_dispatch_flags_t flags) {
return !iree_hal_dispatch_uses_indirect_parameters(flags) &&
(config.workgroup_count[0] | config.workgroup_count[1] |
config.workgroup_count[2]) == 0;
}
// Queue dispatch entry point. Empty direct dispatches route through the barrier
// path so they still signal semaphores and profile as dispatch submissions.
static iree_status_t iree_hal_amdgpu_host_queue_dispatch(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_executable_t* executable,
iree_hal_executable_export_ordinal_t export_ordinal,
const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
const iree_hal_buffer_ref_list_t bindings,
iree_hal_dispatch_flags_t flags) {
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
const bool is_noop_dispatch =
iree_hal_amdgpu_host_queue_is_noop_dispatch(config, flags);
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
if (is_noop_dispatch) {
status = iree_hal_amdgpu_host_queue_defer_execute(
queue, &wait_semaphore_list, &signal_semaphore_list,
/*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(),
IREE_HAL_EXECUTE_FLAG_NONE, &submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_defer_dispatch(
queue, &wait_semaphore_list, &signal_semaphore_list, executable,
export_ordinal, config, constants, bindings, flags,
&submission.deferred_op);
}
} else if (is_noop_dispatch) {
uint64_t submission_id = 0;
iree_hal_amdgpu_host_queue_profile_event_info_t profile_event_info = {
.type = IREE_HAL_PROFILE_QUEUE_EVENT_TYPE_DISPATCH,
.operation_count = 0,
};
status = iree_hal_amdgpu_host_queue_try_submit_barrier(
queue, &submission.resolution, signal_semaphore_list,
(iree_hal_amdgpu_reclaim_action_t){0},
/*operation_resources=*/NULL,
/*operation_resource_count=*/0, &profile_event_info,
iree_hal_amdgpu_host_queue_post_commit_callback_null(),
/*resource_set=*/NULL,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready, &submission_id);
if (iree_status_is_ok(status) && submission.ready) {
profile_event_info.submission_id = submission_id;
iree_hal_amdgpu_host_queue_record_profile_queue_event(
queue, &submission.resolution, signal_semaphore_list,
&profile_event_info);
}
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_execute(
queue, &wait_semaphore_list, &signal_semaphore_list,
/*command_buffer=*/NULL, iree_hal_buffer_binding_table_empty(),
IREE_HAL_EXECUTE_FLAG_NONE, &submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
} else {
status = iree_hal_amdgpu_host_queue_submit_dispatch(
queue, &submission.resolution, signal_semaphore_list, executable,
export_ordinal, config, constants, bindings, flags,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_dispatch(
queue, &wait_semaphore_list, &signal_semaphore_list, executable,
export_ordinal, config, constants, bindings, flags,
&submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
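
// Queue file read entry point; delegates to the file module, which stages
// through the shared copy path.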
static iree_status_t iree_hal_amdgpu_host_queue_read(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_file_t* source_file, uint64_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_read_flags_t flags) {
return iree_hal_amdgpu_host_queue_read_file(
base_queue, wait_semaphore_list, signal_semaphore_list, source_file,
source_offset, target_buffer, target_offset, length, flags);
}
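
// Queue file write entry point; delegates to the file module, which stages
// through the shared copy path.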
static iree_status_t iree_hal_amdgpu_host_queue_write(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_file_t* target_file, uint64_t target_offset,
iree_device_size_t length, iree_hal_write_flags_t flags) {
return iree_hal_amdgpu_host_queue_write_file(
base_queue, wait_semaphore_list, signal_semaphore_list, source_buffer,
source_offset, target_file, target_offset, length, flags);
}
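
// Enqueues a host callback that runs once |wait_semaphore_list| is satisfied.
// The callback is carried as the reclaim action of a signal-less barrier
// submission; |operation_resources| are retained with the submission to keep
// the callback's dependencies live.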
iree_status_t iree_hal_amdgpu_host_queue_enqueue_host_action(
iree_hal_amdgpu_host_queue_t* queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
iree_hal_amdgpu_reclaim_action_t action,
iree_hal_resource_t* const* operation_resources,
iree_host_size_t operation_resource_count) {
if (IREE_UNLIKELY(!action.fn)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"host action callback must be non-null");
}
if (IREE_UNLIKELY(operation_resource_count > 0 && !operation_resources)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"host action resources must be non-null");
}
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
// Host actions execute on CPU threads and must observe device-produced
// host-visible memory even when a semaphore edge itself is device-local.
submission.resolution.inline_acquire_scope =
iree_hal_amdgpu_host_queue_max_fence_scope(
submission.resolution.inline_acquire_scope,
IREE_HSA_FENCE_SCOPE_SYSTEM);
submission.resolution.barrier_acquire_scope =
iree_hal_amdgpu_host_queue_max_fence_scope(
submission.resolution.barrier_acquire_scope,
IREE_HSA_FENCE_SCOPE_SYSTEM);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_host_action(
queue, &wait_semaphore_list, action, operation_resources,
operation_resource_count, &submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_try_submit_barrier(
queue, &submission.resolution, iree_hal_semaphore_list_empty(), action,
operation_resources, operation_resource_count,
/*profile_event_info=*/NULL,
iree_hal_amdgpu_host_queue_post_commit_callback_null(),
/*resource_set=*/NULL,
IREE_HAL_AMDGPU_HOST_QUEUE_SUBMISSION_FLAG_RETAIN_RESOURCES,
&submission.ready, /*out_submission_id=*/NULL);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_host_action(
queue, &wait_semaphore_list, action, operation_resources,
operation_resource_count, &submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
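
// Queue host call entry point. Validates |call| and |args| up front, then
// submits directly when waits are resolved and capacity allows, deferring
// otherwise like the other queue operations.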
static iree_status_t iree_hal_amdgpu_host_queue_host_call(
iree_hal_amdgpu_virtual_queue_t* base_queue,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_host_call_t call, const uint64_t args[4],
iree_hal_host_call_flags_t flags) {
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_host_queue_validate_host_call(call, args, flags));
iree_hal_amdgpu_host_queue_t* queue =
(iree_hal_amdgpu_host_queue_t*)base_queue;
iree_hal_amdgpu_host_queue_op_submission_t submission;
iree_hal_amdgpu_host_queue_op_submission_begin(queue, wait_semaphore_list,
&submission);
iree_status_t status = iree_ok_status();
if (submission.resolution.needs_deferral) {
status = iree_hal_amdgpu_host_queue_defer_host_call(
queue, &wait_semaphore_list, &signal_semaphore_list, call, args, flags,
&submission.deferred_op);
} else {
status = iree_hal_amdgpu_host_queue_submit_host_call(
queue, &submission.resolution, signal_semaphore_list, call, args, flags,
&submission.ready);
if (iree_status_is_ok(status) && !submission.ready) {
status = iree_hal_amdgpu_host_queue_defer_host_call(
queue, &wait_semaphore_list, &signal_semaphore_list, call, args,
flags, &submission.deferred_op);
iree_hal_amdgpu_host_queue_op_submission_defer_for_capacity(&submission);
}
}
return iree_hal_amdgpu_host_queue_op_submission_end(&submission, status);
}
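
// No-op: the host queue commits work as it is submitted and batches nothing
// that would require an explicit flush.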
static iree_status_t iree_hal_amdgpu_host_queue_flush(
iree_hal_amdgpu_virtual_queue_t* base_queue) {
return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// Virtual queue vtable
//===----------------------------------------------------------------------===//
static void iree_hal_amdgpu_host_queue_deinitialize_vtable(
iree_hal_amdgpu_virtual_queue_t* base_queue) {
iree_hal_amdgpu_host_queue_deinitialize(
(iree_hal_amdgpu_host_queue_t*)base_queue);
}
static const iree_hal_amdgpu_virtual_queue_vtable_t
iree_hal_amdgpu_host_queue_vtable = {
.deinitialize = iree_hal_amdgpu_host_queue_deinitialize_vtable,
.trim = iree_hal_amdgpu_host_queue_trim,
.alloca = iree_hal_amdgpu_host_queue_alloca,
.dealloca = iree_hal_amdgpu_host_queue_dealloca,
.fill = iree_hal_amdgpu_host_queue_fill,
.update = iree_hal_amdgpu_host_queue_update,
.copy = iree_hal_amdgpu_host_queue_copy,
.read = iree_hal_amdgpu_host_queue_read,
.write = iree_hal_amdgpu_host_queue_write,
.host_call = iree_hal_amdgpu_host_queue_host_call,
.dispatch = iree_hal_amdgpu_host_queue_dispatch,
.execute = iree_hal_amdgpu_host_queue_execute,
.flush = iree_hal_amdgpu_host_queue_flush,
};