// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/amdgpu/logical_device.h"
#include "iree/async/frontier.h"
#include "iree/async/frontier_tracker.h"
#include "iree/async/util/proactor_pool.h"
#include "iree/hal/drivers/amdgpu/abi/signal.h"
#include "iree/hal/drivers/amdgpu/allocator.h"
#include "iree/hal/drivers/amdgpu/api.h"
#include "iree/hal/drivers/amdgpu/aql_command_buffer.h"
#include "iree/hal/drivers/amdgpu/aql_program_builder.h"
#include "iree/hal/drivers/amdgpu/executable.h"
#include "iree/hal/drivers/amdgpu/executable_cache.h"
#include "iree/hal/drivers/amdgpu/host_queue_profile.h"
#include "iree/hal/drivers/amdgpu/host_queue_profile_events.h"
#include "iree/hal/drivers/amdgpu/physical_device.h"
#include "iree/hal/drivers/amdgpu/profile_counters.h"
#include "iree/hal/drivers/amdgpu/profile_device_metrics.h"
#include "iree/hal/drivers/amdgpu/profile_traces.h"
#include "iree/hal/drivers/amdgpu/queue_affinity.h"
#include "iree/hal/drivers/amdgpu/semaphore.h"
#include "iree/hal/drivers/amdgpu/system.h"
#include "iree/hal/drivers/amdgpu/util/epoch_signal_table.h"
#include "iree/hal/drivers/amdgpu/util/kfd.h"
#include "iree/hal/drivers/amdgpu/util/notification_ring.h"
#include "iree/hal/drivers/amdgpu/util/topology.h"
#include "iree/hal/drivers/amdgpu/util/vmem.h"
#include "iree/hal/utils/file_registry.h"
//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//
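// Builds the queue affinity domain for this logical device: the supported
// queue mask plus the physical device count and per-device queue count used by
// the affinity resolution helpers below.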
static iree_hal_amdgpu_queue_affinity_domain_t
iree_hal_amdgpu_logical_device_queue_affinity_domain(
const iree_hal_amdgpu_logical_device_t* logical_device) {
return (iree_hal_amdgpu_queue_affinity_domain_t){
.supported_affinity = logical_device->queue_affinity_mask,
.physical_device_count = logical_device->physical_device_count,
.queue_count_per_physical_device =
logical_device->system->topology.gpu_agent_queue_count,
};
}
// Returns the queue for a flattened logical queue ordinal.
static iree_status_t iree_hal_amdgpu_logical_device_queue_from_ordinal(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_host_size_t queue_ordinal,
iree_hal_amdgpu_virtual_queue_t** out_queue) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_queue);
*out_queue = NULL;
iree_hal_amdgpu_queue_affinity_resolved_t resolved;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_resolve_ordinal(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_ordinal, &resolved));
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[resolved.physical_device_ordinal];
if (IREE_UNLIKELY(resolved.physical_queue_ordinal >=
physical_device->host_queue_count)) {
return iree_make_status(IREE_STATUS_INTERNAL,
"queue affinity ordinal %" PRIhsz
" maps to invalid host queue ordinal "
"%" PRIhsz " on physical device %" PRIhsz,
queue_ordinal, resolved.physical_queue_ordinal,
resolved.physical_device_ordinal);
}
*out_queue =
&physical_device->host_queues[resolved.physical_queue_ordinal].base;
return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_logical_device_options_t
//===----------------------------------------------------------------------===//
// Power-of-two size for the shared host small block pool in bytes.
// Used for small host-side transients/wrappers of device-side resources.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_SMALL_HOST_BLOCK_SIZE (8 * 1024)
// Minimum size of a small host block (some structures require at least this
// much memory).
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_SMALL_HOST_BLOCK_SIZE (4 * 1024)
// Power-of-two size for the shared host large block pool in bytes.
// Used for resource tracking and other larger host-side transients.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_LARGE_HOST_BLOCK_SIZE (64 * 1024)
// Minimum size of a large host block (some structures require at least this
// much memory).
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_LARGE_HOST_BLOCK_SIZE (64 * 1024)
IREE_API_EXPORT void iree_hal_amdgpu_logical_device_options_initialize(
iree_hal_amdgpu_logical_device_options_t* out_options) {
IREE_ASSERT_ARGUMENT(out_options);
memset(out_options, 0, sizeof(*out_options));
// TODO(benvanik): set defaults based on compiler configuration. Flags should
// not be used as multiple devices may be configured within the process, or
// the hosting application may be authored in Python/etc with no flags
// mechanism accessible here.
out_options->host_block_pools.small.block_size =
IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_SMALL_HOST_BLOCK_SIZE;
out_options->host_block_pools.large.block_size =
IREE_HAL_AMDGPU_LOGICAL_DEVICE_DEFAULT_LARGE_HOST_BLOCK_SIZE;
out_options->host_block_pools.command_buffer.usable_block_size =
IREE_HAL_AMDGPU_AQL_PROGRAM_DEFAULT_BLOCK_SIZE;
out_options->device_block_pools.small.block_size =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCK_SIZE_DEFAULT;
out_options->device_block_pools.small.initial_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_SMALL_DEVICE_BLOCK_INITIAL_CAPACITY_DEFAULT;
out_options->device_block_pools.large.block_size =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCK_SIZE_DEFAULT;
out_options->device_block_pools.large.initial_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_LARGE_DEVICE_BLOCK_INITIAL_CAPACITY_DEFAULT;
out_options->default_pool.range_length =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_RANGE_LENGTH_DEFAULT;
out_options->default_pool.alignment =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_ALIGNMENT_DEFAULT;
out_options->default_pool.frontier_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_POOL_FRONTIER_CAPACITY_DEFAULT;
out_options->queue_placement = IREE_HAL_AMDGPU_QUEUE_PLACEMENT_ANY;
out_options->host_queues.aql_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_AQL_CAPACITY;
out_options->host_queues.notification_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_NOTIFICATION_CAPACITY;
out_options->host_queues.kernarg_capacity =
IREE_HAL_AMDGPU_PHYSICAL_DEVICE_DEFAULT_HOST_QUEUE_KERNARG_CAPACITY;
out_options->preallocate_pools = 1;
}
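// Parses |params| key/value pairs into |options|. No parameters are currently
// recognized and any that are provided will be rejected.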
IREE_API_EXPORT iree_status_t iree_hal_amdgpu_logical_device_options_parse(
iree_hal_amdgpu_logical_device_options_t* options,
iree_string_pair_list_t params) {
IREE_ASSERT_ARGUMENT(options);
if (!params.count) return iree_ok_status();
IREE_TRACE_ZONE_BEGIN(z0);
const iree_string_pair_t* first_param = &params.pairs[0];
iree_status_t status = iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"AMDGPU logical device options do not support key/value parameter '%.*s'",
(int)first_param->key.size, first_param->key.data);
IREE_TRACE_ZONE_END(z0);
return status;
}
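// Verifies that the requested options are supported by the current
// implementation: recognized-but-unavailable features fail with UNIMPLEMENTED
// and malformed values fail with INVALID_ARGUMENT/OUT_OF_RANGE.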
iree_status_t iree_hal_amdgpu_logical_device_options_verify_supported_features(
const iree_hal_amdgpu_logical_device_options_t* options) {
IREE_ASSERT_ARGUMENT(options);
switch (options->queue_placement) {
case IREE_HAL_AMDGPU_QUEUE_PLACEMENT_ANY:
case IREE_HAL_AMDGPU_QUEUE_PLACEMENT_HOST:
break;
case IREE_HAL_AMDGPU_QUEUE_PLACEMENT_DEVICE:
return iree_make_status(
IREE_STATUS_UNIMPLEMENTED,
"AMDGPU device queue placement is not implemented; use "
"queue_placement=any or queue_placement=host");
default:
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"invalid AMDGPU queue placement value %u",
(uint32_t)options->queue_placement);
}
if (options->exclusive_execution) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU exclusive_execution is not implemented");
}
if (options->wait_active_for_ns < 0) {
return iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"AMDGPU wait_active_for_ns must be non-negative (got %" PRId64 ")",
options->wait_active_for_ns);
}
if (options->wait_active_for_ns != 0) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU wait_active_for_ns is not implemented; "
"use 0");
}
return iree_ok_status();
}
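// Verifies option values against implementation limits and |topology|:
// block pool sizes, the queue-axis encoding limit, the total queue count, and
// host queue capacities.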
static iree_status_t iree_hal_amdgpu_logical_device_options_verify(
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology) {
IREE_ASSERT_ARGUMENT(options);
IREE_ASSERT_ARGUMENT(topology);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_logical_device_options_verify_supported_features(
options));
if (options->host_block_pools.small.block_size <
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_SMALL_HOST_BLOCK_SIZE ||
!iree_host_size_is_power_of_two(
options->host_block_pools.small.block_size)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"small host block pool size invalid, expected a "
"power-of-two greater than %d and got %" PRIhsz,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_SMALL_HOST_BLOCK_SIZE,
options->host_block_pools.small.block_size));
}
if (options->host_block_pools.large.block_size <
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_LARGE_HOST_BLOCK_SIZE ||
!iree_host_size_is_power_of_two(
options->host_block_pools.large.block_size)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"large host block pool size invalid, expected a "
"power-of-two greater than %d and got %" PRIhsz,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_MIN_LARGE_HOST_BLOCK_SIZE,
options->host_block_pools.large.block_size));
}
if (options->host_block_pools.command_buffer.usable_block_size <
IREE_HAL_AMDGPU_AQL_PROGRAM_MIN_BLOCK_SIZE ||
options->host_block_pools.command_buffer.usable_block_size > UINT32_MAX ||
!iree_host_size_is_power_of_two(
options->host_block_pools.command_buffer.usable_block_size)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"command-buffer host block pool usable size invalid, expected "
"a power-of-two between %u and %u and got %" PRIhsz,
IREE_HAL_AMDGPU_AQL_PROGRAM_MIN_BLOCK_SIZE, UINT32_MAX,
options->host_block_pools.command_buffer.usable_block_size));
}
if (topology->gpu_agent_queue_count > UINT8_MAX) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"gpu_agent_queue_count=%" PRIhsz
" exceeds the queue-axis encoding limit (%u)",
topology->gpu_agent_queue_count, UINT8_MAX));
}
iree_host_size_t total_queue_count = 0;
if (!iree_host_size_checked_mul(topology->gpu_agent_count,
topology->gpu_agent_queue_count,
&total_queue_count) ||
total_queue_count > IREE_HAL_MAX_QUEUES) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"topology queue space does not fit in iree_hal_queue_affinity_t "
"(gpu_agent_count=%" PRIhsz ", gpu_agent_queue_count=%" PRIhsz
", max_total_queues=%" PRIhsz ")",
topology->gpu_agent_count, topology->gpu_agent_queue_count,
(iree_host_size_t)IREE_HAL_MAX_QUEUES));
}
if (!iree_host_size_is_power_of_two(options->host_queues.aql_capacity) ||
!iree_host_size_is_power_of_two(
options->host_queues.notification_capacity) ||
!iree_host_size_is_power_of_two(options->host_queues.kernarg_capacity)) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"host queue AQL, notification, and kernarg capacities must all "
"be powers of two (got aql=%u, notification=%u, "
"kernarg_blocks=%u)",
options->host_queues.aql_capacity,
options->host_queues.notification_capacity,
options->host_queues.kernarg_capacity));
}
if (options->host_queues.kernarg_capacity / 2u <
options->host_queues.aql_capacity) {
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"host queue kernarg capacity must be at least 2x the AQL queue "
"capacity (got kernarg_blocks=%u, aql_packets=%u)",
options->host_queues.kernarg_capacity,
options->host_queues.aql_capacity));
}
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_logical_device_t
//===----------------------------------------------------------------------===//
static const iree_hal_device_vtable_t iree_hal_amdgpu_logical_device_vtable;
static iree_hal_amdgpu_logical_device_t* iree_hal_amdgpu_logical_device_cast(
iree_hal_device_t* base_value) {
IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_amdgpu_logical_device_vtable);
return (iree_hal_amdgpu_logical_device_t*)base_value;
}
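// Returns true if any family in |data_families| requires HSA timestamping to
// be enabled on the underlying hardware queues.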
static bool iree_hal_amdgpu_logical_device_profiling_needs_hsa_timestamps(
iree_hal_device_profiling_data_families_t data_families) {
return iree_any_bit_set(data_families,
IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_SAMPLES |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES);
}
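// Data families implied by a lightweight-statistics profiling request.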
static iree_hal_device_profiling_data_families_t
iree_hal_amdgpu_logical_device_lightweight_statistics_data_families(void) {
return IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_METADATA |
IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS;
}
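// Resolves user-provided profiling options: a lightweight-statistics request
// with no explicit data families is expanded to the implied families and the
// flag is stripped so downstream code only sees explicit families.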
static iree_hal_device_profiling_options_t
iree_hal_amdgpu_logical_device_resolve_profiling_options(
const iree_hal_device_profiling_options_t* options) {
iree_hal_device_profiling_options_t resolved_options = *options;
if (resolved_options.data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE &&
iree_hal_device_profiling_options_requests_lightweight_statistics(
options)) {
resolved_options.data_families =
iree_hal_amdgpu_logical_device_lightweight_statistics_data_families();
}
resolved_options.flags &=
~IREE_HAL_DEVICE_PROFILING_FLAG_LIGHTWEIGHT_STATISTICS;
return resolved_options;
}
// Power-of-two capacity for logical-device memory lifecycle event buffering.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_MEMORY_EVENT_CAPACITY (64 * 1024)
// Power-of-two capacity for logical-device queue operation event buffering.
#define IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_QUEUE_EVENT_CAPACITY (64 * 1024)
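// Returns SESSION chunk metadata tagged with the logical device identifier and
// |session_id|.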
static iree_hal_profile_chunk_metadata_t
iree_hal_amdgpu_logical_device_profile_session_metadata(
iree_hal_amdgpu_logical_device_t* logical_device, uint64_t session_id) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_SESSION;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
return metadata;
}
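// Packs a (physical device ordinal, queue ordinal) pair into a 64-bit profile
// stream ID with the device ordinal in the high 32 bits and the queue ordinal
// in the low 32 bits, e.g. (device=1, queue=2) -> 0x0000000100000002.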
static uint64_t iree_hal_amdgpu_logical_device_profile_queue_stream_id(
uint32_t physical_device_ordinal, uint32_t queue_ordinal) {
return ((uint64_t)physical_device_ordinal << 32) | (uint64_t)queue_ordinal;
}
static bool iree_hal_amdgpu_logical_device_profile_memory_events_requested(
const iree_hal_amdgpu_logical_device_t* logical_device) {
return iree_hal_device_profiling_options_requests_data(
&logical_device->profiling.options,
IREE_HAL_DEVICE_PROFILING_DATA_MEMORY_EVENTS) &&
logical_device->profiling.options.sink &&
iree_hal_amdgpu_profile_event_streams_has_memory_storage(
&logical_device->profiling.event_streams);
}
bool iree_hal_amdgpu_logical_device_should_record_profile_memory_events(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return iree_hal_amdgpu_logical_device_profile_memory_events_requested(
logical_device);
}
static void iree_hal_amdgpu_logical_device_reset_profile_options(
iree_hal_amdgpu_logical_device_t* logical_device) {
iree_hal_device_profiling_options_storage_free(
logical_device->profiling.options_storage,
logical_device->host_allocator);
logical_device->profiling.options_storage = NULL;
logical_device->profiling.options = (iree_hal_device_profiling_options_t){0};
}
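// Returns true if the dispatch at the given location should be captured under
// the active profiling options: at least one dispatch-level data family must
// be requested, the capture filter location must match, and (when enabled) the
// executable export name pattern must match.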
bool iree_hal_amdgpu_logical_device_should_profile_dispatch(
iree_hal_amdgpu_logical_device_t* logical_device, uint64_t executable_id,
uint32_t export_ordinal, uint64_t command_buffer_id, uint32_t command_index,
uint32_t physical_device_ordinal, uint32_t queue_ordinal) {
if (!iree_any_bit_set(logical_device->profiling.options.data_families,
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_SAMPLES |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES)) {
return false;
}
const iree_hal_profile_capture_filter_t* filter =
&logical_device->profiling.options.capture_filter;
if (!iree_hal_profile_capture_filter_matches_location(
filter, command_buffer_id, command_index, physical_device_ordinal,
queue_ordinal)) {
return false;
}
if (iree_any_bit_set(
filter->flags,
IREE_HAL_PROFILE_CAPTURE_FILTER_FLAG_EXECUTABLE_EXPORT_PATTERN)) {
return iree_hal_amdgpu_profile_metadata_export_matches(
&logical_device->profile_metadata, executable_id, export_ordinal,
filter->executable_export_pattern);
}
return true;
}
uint64_t iree_hal_amdgpu_logical_device_allocate_profile_memory_allocation_id(
iree_hal_device_t* base_device, uint64_t* out_session_id) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
*out_session_id = 0;
if (!iree_hal_amdgpu_logical_device_profile_memory_events_requested(
logical_device)) {
return 0;
}
return iree_hal_amdgpu_profile_event_streams_allocate_memory_allocation_id(
&logical_device->profiling.event_streams,
logical_device->profiling.session_id, out_session_id);
}
bool iree_hal_amdgpu_logical_device_record_profile_memory_event_for_session(
iree_hal_device_t* base_device, uint64_t session_id,
const iree_hal_profile_memory_event_t* event) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
if (!iree_hal_amdgpu_logical_device_profile_memory_events_requested(
logical_device)) {
return false;
}
return iree_hal_amdgpu_profile_event_streams_record_memory_event(
&logical_device->profiling.event_streams,
logical_device->profiling.session_id, session_id, event);
}
bool iree_hal_amdgpu_logical_device_record_profile_memory_event(
iree_hal_device_t* base_device,
const iree_hal_profile_memory_event_t* event) {
return iree_hal_amdgpu_logical_device_record_profile_memory_event_for_session(
base_device, /*session_id=*/0, event);
}
static bool iree_hal_amdgpu_logical_device_profile_queue_events_requested(
const iree_hal_amdgpu_logical_device_t* logical_device) {
return iree_hal_device_profiling_options_requests_data(
&logical_device->profiling.options,
IREE_HAL_DEVICE_PROFILING_DATA_QUEUE_EVENTS) &&
logical_device->profiling.options.sink &&
iree_hal_amdgpu_profile_event_streams_has_queue_storage(
&logical_device->profiling.event_streams);
}
void iree_hal_amdgpu_logical_device_record_profile_queue_event(
iree_hal_device_t* base_device,
const iree_hal_profile_queue_event_t* event) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
if (!iree_hal_amdgpu_logical_device_profile_queue_events_requested(
logical_device)) {
return;
}
iree_hal_amdgpu_profile_event_streams_record_queue_event(
&logical_device->profiling.event_streams, event);
}
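// Samples one host/device clock correlation record for |physical_device| by
// querying the KFD clock counters and bracketing the query with host
// timestamps.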
static iree_status_t
iree_hal_amdgpu_logical_device_sample_profile_clock_correlation(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_physical_device_t* physical_device,
iree_hal_profile_clock_correlation_record_t* out_record) {
if (IREE_UNLIKELY(physical_device->device_ordinal > UINT32_MAX)) {
return iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"profile clock correlation physical device ordinal out of range: "
"%" PRIhsz,
physical_device->device_ordinal);
}
iree_hal_amdgpu_clock_counters_t counters = {0};
const iree_time_t host_time_begin_ns = iree_time_now();
iree_status_t status = iree_hal_amdgpu_kfd_get_clock_counters(
logical_device->system->kfd_fd, physical_device->kfd_gpu_uid, &counters);
const iree_time_t host_time_end_ns = iree_time_now();
if (iree_status_is_ok(status)) {
*out_record = iree_hal_profile_clock_correlation_record_default();
out_record->flags =
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_DEVICE_TICK |
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_HOST_CPU_TIMESTAMP |
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_HOST_SYSTEM_TIMESTAMP |
IREE_HAL_PROFILE_CLOCK_CORRELATION_FLAG_HOST_TIME_BRACKET;
out_record->physical_device_ordinal =
(uint32_t)physical_device->device_ordinal;
out_record->sample_id =
logical_device->profiling.next_clock_correlation_sample_id++;
out_record->device_tick = counters.gpu_clock_counter;
out_record->host_cpu_timestamp_ns = counters.cpu_clock_counter;
out_record->host_system_timestamp = counters.system_clock_counter;
out_record->host_system_frequency_hz = counters.system_clock_freq;
out_record->host_time_begin_ns = host_time_begin_ns;
out_record->host_time_end_ns = host_time_end_ns;
} else {
status = iree_status_annotate_f(
status,
"sampling profile clock correlation for physical_device_ordinal=%zu "
"gpu_uid=%" PRIu32,
physical_device->device_ordinal, physical_device->kfd_gpu_uid);
}
return status;
}
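// Writes one DEVICES metadata chunk to |sink| enumerating every physical
// device in the logical device (ordinal, queue count, and UUID if available).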
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_devices(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
const iree_host_size_t record_count = logical_device->physical_device_count;
if (record_count == 0) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no physical devices (initialization incomplete)");
}
iree_host_size_t records_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &records_size,
IREE_STRUCT_FIELD(record_count, iree_hal_profile_device_record_t,
NULL)));
iree_hal_profile_device_record_t* records = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc(logical_device->host_allocator, records_size,
(void**)&records));
iree_status_t status = iree_ok_status();
for (iree_host_size_t i = 0; i < record_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (IREE_UNLIKELY(physical_device->device_ordinal > UINT32_MAX ||
physical_device->host_queue_count > UINT32_MAX)) {
status = iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"profile device metadata ordinals out of range: device=%" PRIhsz
", queue_count=%" PRIhsz,
physical_device->device_ordinal, physical_device->host_queue_count);
break;
}
records[i] = iree_hal_profile_device_record_default();
records[i].physical_device_ordinal =
(uint32_t)physical_device->device_ordinal;
records[i].queue_count = (uint32_t)physical_device->host_queue_count;
if (physical_device->has_physical_device_uuid) {
records[i].flags |= IREE_HAL_PROFILE_DEVICE_FLAG_PHYSICAL_DEVICE_UUID;
memcpy(records[i].physical_device_uuid,
physical_device->physical_device_uuid,
sizeof(records[i].physical_device_uuid));
}
}
if (iree_status_is_ok(status)) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_DEVICES;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
iree_const_byte_span_t iovec =
iree_make_const_byte_span(records, records_size);
status = iree_hal_profile_sink_write(sink, &metadata, 1, &iovec);
}
iree_allocator_free(logical_device->host_allocator, records);
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_queues(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
iree_host_size_t record_count = 0;
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (IREE_UNLIKELY(!iree_host_size_checked_add(
record_count, physical_device->host_queue_count, &record_count))) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"profile queue metadata count overflow");
}
}
if (record_count == 0) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no host queues (initialization incomplete)");
}
iree_host_size_t records_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &records_size,
IREE_STRUCT_FIELD(record_count, iree_hal_profile_queue_record_t,
NULL)));
iree_hal_profile_queue_record_t* records = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc(logical_device->host_allocator, records_size,
(void**)&records));
iree_status_t status = iree_ok_status();
iree_host_size_t record_ordinal = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (IREE_UNLIKELY(physical_device->device_ordinal > UINT32_MAX)) {
status = iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"profile queue metadata physical device "
"ordinal out of range: %" PRIhsz,
physical_device->device_ordinal);
break;
}
const uint32_t physical_device_ordinal =
(uint32_t)physical_device->device_ordinal;
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
if (IREE_UNLIKELY(j > UINT32_MAX)) {
status = iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"profile queue metadata queue ordinal out of range: %" PRIhsz, j);
break;
}
const uint32_t queue_ordinal = (uint32_t)j;
records[record_ordinal] = iree_hal_profile_queue_record_default();
records[record_ordinal].physical_device_ordinal = physical_device_ordinal;
records[record_ordinal].queue_ordinal = queue_ordinal;
records[record_ordinal].stream_id =
iree_hal_amdgpu_logical_device_profile_queue_stream_id(
physical_device_ordinal, queue_ordinal);
++record_ordinal;
}
}
if (iree_status_is_ok(status)) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_QUEUES;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
iree_const_byte_span_t iovec =
iree_make_const_byte_span(records, records_size);
status = iree_hal_profile_sink_write(sink, &metadata, 1, &iovec);
}
iree_allocator_free(logical_device->host_allocator, records);
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t
iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
const iree_host_size_t record_count = logical_device->physical_device_count;
if (record_count == 0) {
IREE_TRACE_ZONE_END(z0);
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no physical devices (initialization incomplete)");
}
iree_host_size_t records_size = 0;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, IREE_STRUCT_LAYOUT(
0, &records_size,
IREE_STRUCT_FIELD(record_count,
iree_hal_profile_clock_correlation_record_t,
NULL)));
iree_hal_profile_clock_correlation_record_t* records = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_allocator_malloc(logical_device->host_allocator, records_size,
(void**)&records));
iree_status_t status = iree_ok_status();
for (iree_host_size_t i = 0; i < record_count && iree_status_is_ok(status);
++i) {
status = iree_hal_amdgpu_logical_device_sample_profile_clock_correlation(
logical_device, logical_device->physical_devices[i], &records[i]);
}
if (iree_status_is_ok(status)) {
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_profile_chunk_metadata_default();
metadata.content_type = IREE_HAL_PROFILE_CONTENT_TYPE_CLOCK_CORRELATIONS;
metadata.name = logical_device->identifier;
metadata.session_id = session_id;
iree_const_byte_span_t iovec =
iree_make_const_byte_span(records, records_size);
status = iree_hal_profile_sink_write(sink, &metadata, 1, &iovec);
}
iree_allocator_free(logical_device->host_allocator, records);
IREE_TRACE_ZONE_END(z0);
return status;
}
static bool iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
iree_hal_device_profiling_data_families_t data_families) {
return iree_any_bit_set(data_families,
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_METADATA |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES);
}
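// Writes all profile metadata chunks for |session_id| to |sink|: devices,
// queues, executable metadata (with artifacts when requested), and clock
// correlations.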
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_metadata(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id,
iree_hal_device_profiling_data_families_t data_families) {
const bool emit_executable_artifacts =
iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
data_families);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_write_profile_devices(
logical_device, sink, session_id));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_write_profile_queues(
logical_device, sink, session_id));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_profile_metadata_write(
&logical_device->profile_metadata, sink, session_id,
logical_device->identifier, emit_executable_artifacts,
&logical_device->profiling.metadata_cursor));
return iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
logical_device, sink, session_id);
}
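// Flushes buffered queue and memory event streams followed by per-host-queue
// profile events for |session_id| to |sink|.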
static iree_status_t iree_hal_amdgpu_logical_device_write_profile_events(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_profile_sink_t* sink, uint64_t session_id) {
IREE_TRACE_ZONE_BEGIN(z0);
iree_status_t status = iree_hal_amdgpu_profile_event_streams_write_queue(
&logical_device->profiling.event_streams, sink, session_id,
logical_device->host_allocator);
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_event_streams_write_memory(
&logical_device->profiling.event_streams, sink, session_id,
logical_device->host_allocator);
}
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
status = iree_hal_amdgpu_host_queue_write_profile_events(
&physical_device->host_queues[j], sink, session_id);
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
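// Derives the per-queue profile flags implied by the requested profiling data
// families.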
static iree_hal_amdgpu_host_queue_profile_flags_t
iree_hal_amdgpu_logical_device_queue_profile_flags(
const iree_hal_device_profiling_options_t* options) {
iree_hal_amdgpu_host_queue_profile_flags_t flags =
IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_NONE;
if (iree_hal_device_profiling_options_requests_data(
options, IREE_HAL_DEVICE_PROFILING_DATA_QUEUE_EVENTS)) {
flags |= IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_QUEUE_EVENTS;
}
if (iree_hal_device_profiling_options_requests_data(
options, IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS)) {
flags |= IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_QUEUE_DEVICE_EVENTS;
}
if (iree_any_bit_set(options->data_families,
IREE_HAL_DEVICE_PROFILING_DATA_DISPATCH_EVENTS |
IREE_HAL_DEVICE_PROFILING_DATA_COUNTER_SAMPLES |
IREE_HAL_DEVICE_PROFILING_DATA_EXECUTABLE_TRACES)) {
flags |= IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_DISPATCHES;
}
return flags;
}
static void iree_hal_amdgpu_logical_device_set_queue_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_host_queue_profile_flags_t flags) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count; ++j) {
iree_hal_amdgpu_host_queue_set_profile_flags(
&physical_device->host_queues[j], flags);
}
}
}
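// Toggles HSA timestamp profiling on all physical devices. If an enable fails
// partway through, the devices that were already enabled are rolled back; if a
// disable fails partway through, disable is still attempted on the remaining
// devices so none is left profiling.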
static iree_status_t iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device, bool enabled) {
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
iree_status_t status = iree_ok_status();
iree_host_size_t changed_count = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
status = iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
logical_device->physical_devices[i], enabled);
if (iree_status_is_ok(status)) {
++changed_count;
}
}
if (!iree_status_is_ok(status) && enabled) {
for (iree_host_size_t i = 0; i < changed_count; ++i) {
status = iree_status_join(
status, iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
logical_device->physical_devices[i], false));
}
} else if (!enabled) {
for (iree_host_size_t i = changed_count;
i < logical_device->physical_device_count; ++i) {
status = iree_status_join(
status, iree_hal_amdgpu_physical_device_set_hsa_profiling_enabled(
logical_device->physical_devices[i], false));
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t
iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_profile_counter_session_t* counter_session, bool enabled) {
if (!iree_hal_amdgpu_profile_counter_session_is_active(counter_session)) {
return iree_ok_status();
}
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
iree_status_t status = iree_ok_status();
iree_host_size_t changed_queue_count = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
iree_hal_amdgpu_host_queue_t* queue = &physical_device->host_queues[j];
if (enabled) {
status = iree_hal_amdgpu_host_queue_enable_profile_counters(
queue, counter_session);
if (iree_status_is_ok(status)) {
++changed_queue_count;
}
} else {
iree_hal_amdgpu_host_queue_disable_profile_counters(queue);
}
}
}
if (!iree_status_is_ok(status) && enabled) {
for (iree_host_size_t i = 0, seen_queue_count = 0;
i < logical_device->physical_device_count &&
seen_queue_count < changed_queue_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count &&
seen_queue_count < changed_queue_count;
++j, ++seen_queue_count) {
iree_hal_amdgpu_host_queue_disable_profile_counters(
&physical_device->host_queues[j]);
}
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_amdgpu_profile_trace_session_t* trace_session, bool enabled) {
if (!iree_hal_amdgpu_profile_trace_session_is_active(trace_session)) {
return iree_ok_status();
}
IREE_TRACE_ZONE_BEGIN(z0);
IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, enabled ? 1 : 0);
iree_status_t status = iree_ok_status();
iree_host_size_t changed_queue_count = 0;
for (iree_host_size_t i = 0;
i < logical_device->physical_device_count && iree_status_is_ok(status);
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0;
j < physical_device->host_queue_count && iree_status_is_ok(status);
++j) {
iree_hal_amdgpu_host_queue_t* queue = &physical_device->host_queues[j];
if (enabled) {
status = iree_hal_amdgpu_host_queue_enable_profile_traces(
queue, trace_session);
if (iree_status_is_ok(status)) {
++changed_queue_count;
}
} else {
iree_hal_amdgpu_host_queue_disable_profile_traces(queue);
}
}
}
if (!iree_status_is_ok(status) && enabled) {
for (iree_host_size_t i = 0, seen_queue_count = 0;
i < logical_device->physical_device_count &&
seen_queue_count < changed_queue_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count &&
seen_queue_count < changed_queue_count;
++j, ++seen_queue_count) {
iree_hal_amdgpu_host_queue_disable_profile_traces(
&physical_device->host_queues[j]);
}
}
}
IREE_TRACE_ZONE_END(z0);
return status;
}
// Selects one host queue from |queue_affinity| after intersecting with this
// logical device's supported queues. The current policy is deterministic
// first-set-bit selection, which is enough to honor explicit HIP stream
// affinities and keep the CTS path stable. A multi-bit affinity therefore acts
// as "any of these queues"; queue_flush handles multi-bit masks by iterating
// all selected queues instead.
static iree_status_t iree_hal_amdgpu_logical_device_select_host_queue(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_queue_affinity_t queue_affinity,
iree_hal_amdgpu_virtual_queue_t** out_queue) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_queue);
*out_queue = NULL;
iree_hal_amdgpu_queue_affinity_resolved_t resolved;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_resolve(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_affinity, &resolved));
return iree_hal_amdgpu_logical_device_queue_from_ordinal(
logical_device, resolved.queue_ordinal, out_queue);
}
// Selects the physical device backing |queue_affinity| for pool creation.
//
// Queue pools are scoped to one physical memory domain, but |queue_affinity|
// still has the usual "any queue in this mask" meaning. This helper therefore
// collapses multi-bit masks with the same deterministic first-set-bit policy as
// host queue submission. In practice IREE_HAL_QUEUE_AFFINITY_ANY usually
// selects queue 0 after intersecting with this device's supported queue mask.
static iree_status_t
iree_hal_amdgpu_logical_device_select_queue_pool_physical_device(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_queue_affinity_t queue_affinity,
iree_hal_amdgpu_physical_device_t** out_physical_device) {
IREE_ASSERT_ARGUMENT(logical_device);
IREE_ASSERT_ARGUMENT(out_physical_device);
*out_physical_device = NULL;
iree_hal_amdgpu_queue_affinity_resolved_t resolved;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_resolve(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_affinity, &resolved));
*out_physical_device =
logical_device->physical_devices[resolved.physical_device_ordinal];
return iree_ok_status();
}
// Normalizes command-buffer queue affinity to queues on one physical device and
// returns the physical device ordinal whose executable kernel objects may be
// baked into the recorded command stream.
static iree_status_t
iree_hal_amdgpu_logical_device_normalize_command_buffer_affinity(
iree_hal_amdgpu_logical_device_t* logical_device,
iree_hal_queue_affinity_t queue_affinity,
iree_hal_queue_affinity_t* out_queue_affinity,
iree_host_size_t* out_device_ordinal) {
*out_queue_affinity = 0;
*out_device_ordinal = 0;
return iree_hal_amdgpu_queue_affinity_normalize_for_physical_device(
iree_hal_amdgpu_logical_device_queue_affinity_domain(logical_device),
queue_affinity, out_queue_affinity, out_device_ordinal);
}
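// Frontier tracker query callback. The per-axis epoch signal counts down from
// IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE, so the completed epoch is
// IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE - current signal value; returns true
// once that has reached |epoch|.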
static bool iree_hal_amdgpu_logical_device_query_pool_epoch(
void* user_data, iree_async_axis_t axis, uint64_t epoch) {
iree_hal_amdgpu_logical_device_t* logical_device =
(iree_hal_amdgpu_logical_device_t*)user_data;
hsa_signal_t epoch_signal = {0};
if (!iree_hal_amdgpu_epoch_signal_table_lookup(
logical_device->host_queue_epoch_table, axis, &epoch_signal)) {
return false;
}
iree_amd_signal_t* signal =
(iree_amd_signal_t*)(uintptr_t)epoch_signal.handle;
const iree_hsa_signal_value_t current_value = iree_atomic_load(
(iree_atomic_int64_t*)&signal->value, iree_memory_order_acquire);
if (IREE_UNLIKELY(current_value < 0 ||
current_value > IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE)) {
return false;
}
const uint64_t current_epoch =
(uint64_t)IREE_HAL_AMDGPU_EPOCH_INITIAL_VALUE - (uint64_t)current_value;
return current_epoch >= epoch;
}
static void iree_hal_amdgpu_logical_device_deassign_frontier(
iree_hal_amdgpu_logical_device_t* logical_device) {
IREE_TRACE_ZONE_BEGIN(z0);
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_deassign_frontier(
logical_device->physical_devices[i]);
}
iree_async_frontier_tracker_release(logical_device->frontier_tracker);
logical_device->frontier_tracker = NULL;
logical_device->axis = 0;
memset(&logical_device->topology_info, 0,
sizeof(logical_device->topology_info));
if (logical_device->host_queue_epoch_table) {
iree_allocator_free(logical_device->host_allocator,
logical_device->host_queue_epoch_table);
logical_device->host_queue_epoch_table = NULL;
}
IREE_TRACE_ZONE_END(z0);
}
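// Asynchronous failure handler: records the first failure as the device
// sticky status and frees any subsequent failures.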
static void iree_hal_amdgpu_logical_device_error_handler(void* user_data,
iree_status_t status) {
iree_hal_amdgpu_logical_device_t* logical_device =
(iree_hal_amdgpu_logical_device_t*)user_data;
IREE_TRACE_ZONE_BEGIN(z0);
// Display the error in trace tooling.
IREE_TRACE({
char buffer[1024];
iree_host_size_t buffer_length = 0;
if (iree_status_format(status, sizeof(buffer), buffer, &buffer_length)) {
IREE_TRACE_MESSAGE_DYNAMIC(ERROR, buffer, buffer_length);
}
});
// Set the device sticky error status (if it is not already set).
intptr_t current_value = 0;
if (!iree_atomic_compare_exchange_strong(
&logical_device->failure_status, &current_value, (intptr_t)status,
iree_memory_order_acq_rel, iree_memory_order_relaxed)) {
// Previous status was not OK; the sticky slot owns only the first failure.
iree_status_free(status);
}
IREE_TRACE_ZONE_END(z0);
}
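// Translates logical device options into the physical device options shared
// by every physical device within the logical device.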
static void iree_hal_amdgpu_logical_device_translate_physical_options(
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_topology_t* topology,
iree_hal_amdgpu_physical_device_options_t* out_options) {
iree_hal_amdgpu_physical_device_options_initialize(out_options);
out_options->device_block_pools.small.block_size =
options->device_block_pools.small.block_size;
out_options->device_block_pools.small.initial_capacity =
options->device_block_pools.small.initial_capacity;
out_options->device_block_pools.large.block_size =
options->device_block_pools.large.block_size;
out_options->device_block_pools.large.initial_capacity =
options->device_block_pools.large.initial_capacity;
out_options->default_pool.range_length = options->default_pool.range_length;
out_options->default_pool.alignment = options->default_pool.alignment;
out_options->default_pool.frontier_capacity =
options->default_pool.frontier_capacity;
out_options->host_block_pool_initial_capacity =
options->preallocate_pools ? 16 : 0;
out_options->host_queue_count = topology->gpu_agent_queue_count;
out_options->host_queue_aql_capacity = options->host_queues.aql_capacity;
out_options->host_queue_notification_capacity =
options->host_queues.notification_capacity;
out_options->host_queue_kernarg_capacity =
options->host_queues.kernarg_capacity;
out_options->force_wait_barrier_defer = options->force_wait_barrier_defer;
}
static iree_status_t iree_hal_amdgpu_logical_device_verify_physical_options(
const iree_hal_amdgpu_physical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology) {
for (iree_host_size_t i = 0; i < topology->gpu_agent_count; ++i) {
hsa_agent_t gpu_agent = topology->gpu_agents[i];
hsa_agent_t cpu_agent = topology->cpu_agents[topology->gpu_cpu_map[i]];
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_physical_device_options_verify(options, libhsa,
cpu_agent, gpu_agent),
"verifying GPU agent %" PRIhsz " meets required options", i);
}
return iree_ok_status();
}
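// Allocates the logical device and all of its trailing storage (physical
// device pointer table, per-device data blocks, and identifier string) as a
// single host allocation, initializes base fields, and points the pointer
// table at the trailing data blocks.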
static iree_status_t iree_hal_amdgpu_logical_device_allocate_storage(
iree_string_view_t identifier, const iree_hal_amdgpu_topology_t* topology,
iree_host_size_t physical_device_size, iree_allocator_t host_allocator,
iree_hal_amdgpu_logical_device_t** out_logical_device) {
*out_logical_device = NULL;
iree_hal_amdgpu_logical_device_t* logical_device = NULL;
iree_host_size_t physical_device_data_offset = 0;
iree_host_size_t identifier_offset = 0;
iree_host_size_t total_size = 0;
IREE_RETURN_IF_ERROR(IREE_STRUCT_LAYOUT(
sizeof(*logical_device), &total_size,
IREE_STRUCT_FIELD(topology->gpu_agent_count,
iree_hal_amdgpu_physical_device_t*, NULL),
IREE_STRUCT_ARRAY_FIELD_ALIGNED(
topology->gpu_agent_count, physical_device_size, uint8_t,
iree_max_align_t, &physical_device_data_offset),
IREE_STRUCT_FIELD(identifier.size, char, &identifier_offset)));
const iree_hal_amdgpu_queue_affinity_domain_t queue_affinity_domain = {
.supported_affinity = IREE_HAL_QUEUE_AFFINITY_ANY,
.physical_device_count = topology->gpu_agent_count,
.queue_count_per_physical_device = topology->gpu_agent_queue_count,
};
iree_hal_queue_affinity_t logical_queue_affinity_mask = 0;
for (iree_host_size_t i = 0; i < topology->gpu_agent_count; ++i) {
iree_hal_queue_affinity_t physical_device_affinity = 0;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_for_physical_device(
queue_affinity_domain, i, &physical_device_affinity));
iree_hal_queue_affinity_or_into(logical_queue_affinity_mask,
physical_device_affinity);
}
IREE_RETURN_IF_ERROR(iree_allocator_malloc(host_allocator, total_size,
(void**)&logical_device));
memset(logical_device, 0, total_size);
iree_hal_resource_initialize(&iree_hal_amdgpu_logical_device_vtable,
&logical_device->resource);
iree_string_view_append_to_buffer(identifier, &logical_device->identifier,
(char*)logical_device + identifier_offset);
logical_device->host_allocator = host_allocator;
logical_device->failure_status = IREE_ATOMIC_VAR_INIT(0);
iree_atomic_store(&logical_device->epoch, 0, iree_memory_order_relaxed);
logical_device->next_profile_session_id = 1;
iree_hal_amdgpu_profile_metadata_initialize(
host_allocator, &logical_device->profile_metadata);
iree_hal_amdgpu_profile_event_streams_initialize(
&logical_device->profiling.event_streams);
// Set up the physical device table first so failure cleanup has a valid table.
logical_device->physical_device_count = topology->gpu_agent_count;
logical_device->queue_affinity_mask = logical_queue_affinity_mask;
uint8_t* physical_device_base =
(uint8_t*)logical_device + physical_device_data_offset;
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
logical_device->physical_devices[i] =
(iree_hal_amdgpu_physical_device_t*)physical_device_base;
physical_device_base += physical_device_size;
}
*out_logical_device = logical_device;
return iree_ok_status();
}
static iree_status_t iree_hal_amdgpu_logical_device_initialize_host_resources(
iree_hal_amdgpu_logical_device_t* logical_device,
const iree_hal_amdgpu_logical_device_options_t* options,
iree_async_proactor_pool_t* proactor_pool,
iree_allocator_t host_allocator) {
logical_device->proactor_pool = proactor_pool;
iree_async_proactor_pool_retain(logical_device->proactor_pool);
iree_arena_block_pool_initialize(options->host_block_pools.small.block_size,
host_allocator,
&logical_device->host_block_pools.small);
iree_arena_block_pool_initialize(options->host_block_pools.large.block_size,
host_allocator,
&logical_device->host_block_pools.large);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_aql_program_block_pool_initialize(
options->host_block_pools.command_buffer.usable_block_size,
host_allocator, &logical_device->host_block_pools.command_buffer));
return iree_async_proactor_pool_get(logical_device->proactor_pool, 0,
&logical_device->proactor);
}
static iree_status_t
iree_hal_amdgpu_logical_device_initialize_system_and_allocator(
iree_hal_amdgpu_logical_device_t* logical_device,
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology,
iree_allocator_t host_allocator) {
iree_hal_amdgpu_system_options_t system_options = {
.exclusive_execution = options->exclusive_execution,
};
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_system_allocate(libhsa, topology, system_options,
host_allocator, &logical_device->system));
return iree_hal_amdgpu_allocator_create(
logical_device, &logical_device->system->libhsa,
&logical_device->system->topology, host_allocator,
&logical_device->device_allocator);
}
static iree_status_t iree_hal_amdgpu_logical_device_initialize_physical_devices(
iree_hal_amdgpu_logical_device_t* logical_device,
const iree_hal_amdgpu_topology_t* topology,
const iree_hal_amdgpu_physical_device_options_t* options,
iree_allocator_t host_allocator) {
for (iree_host_size_t device_ordinal = 0;
device_ordinal < logical_device->physical_device_count;
++device_ordinal) {
const iree_host_size_t host_ordinal = topology->gpu_cpu_map[device_ordinal];
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_physical_device_initialize(
(iree_hal_device_t*)logical_device, logical_device->system, options,
logical_device->proactor, host_ordinal,
&logical_device->system->host_memory_pools[host_ordinal],
device_ordinal, host_allocator,
logical_device->physical_devices[device_ordinal]));
}
return iree_ok_status();
}
static iree_status_t iree_hal_amdgpu_logical_device_warmup_host_pools(
iree_hal_amdgpu_logical_device_t* logical_device) {
IREE_RETURN_IF_ERROR(iree_arena_block_pool_preallocate(
&logical_device->host_block_pools.small, 16));
IREE_RETURN_IF_ERROR(iree_arena_block_pool_preallocate(
&logical_device->host_block_pools.large, 16));
return iree_arena_block_pool_preallocate(
&logical_device->host_block_pools.command_buffer, 16);
}
iree_status_t iree_hal_amdgpu_logical_device_create(
iree_string_view_t identifier,
const iree_hal_amdgpu_logical_device_options_t* options,
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_topology_t* topology,
const iree_hal_device_create_params_t* create_params,
iree_allocator_t host_allocator, iree_hal_device_t** out_device) {
IREE_ASSERT_ARGUMENT(options);
IREE_ASSERT_ARGUMENT(create_params);
IREE_ASSERT_ARGUMENT(create_params->proactor_pool);
IREE_ASSERT_ARGUMENT(out_device);
IREE_TRACE_ZONE_BEGIN(z0);
*out_device = NULL;
// Verify the topology is valid for a logical device.
// This may have already been performed by the caller but doing it here
// ensures all code paths must verify prior to creating a device.
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_topology_verify(topology, libhsa),
"verifying topology");
// Verify the parameters prior to creating resources.
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_hal_amdgpu_logical_device_options_verify(options, libhsa, topology),
"verifying logical device options");
iree_hal_amdgpu_physical_device_options_t physical_device_options = {0};
iree_hal_amdgpu_logical_device_translate_physical_options(
options, topology, &physical_device_options);
// Verify all GPU agents meet the required physical device options. Each
// embedded physical device has the same layout because all physical devices
// in one logical device share the same host-queue options.
const iree_host_size_t physical_device_size =
iree_hal_amdgpu_physical_device_calculate_size(&physical_device_options);
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0,
iree_hal_amdgpu_logical_device_verify_physical_options(
&physical_device_options, libhsa, topology),
"verifying physical device options");
// Allocate the logical device and all nested physical device data structures.
iree_hal_amdgpu_logical_device_t* logical_device = NULL;
IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_amdgpu_logical_device_allocate_storage(
identifier, topology, physical_device_size, host_allocator,
&logical_device));
iree_status_t status =
iree_hal_amdgpu_logical_device_initialize_host_resources(
logical_device, options, create_params->proactor_pool,
host_allocator);
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_initialize_system_and_allocator(
logical_device, options, libhsa, topology, host_allocator);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_initialize_physical_devices(
logical_device, topology, &physical_device_options, host_allocator);
}
// If requested, warm up the pools we expect to grow on first use of the
// backend. The first use may still need more than is preallocated here but
// that's ok - users can warm up further if they want.
if (iree_status_is_ok(status) && options->preallocate_pools) {
status = iree_hal_amdgpu_logical_device_warmup_host_pools(logical_device);
}
if (iree_status_is_ok(status)) {
*out_device = (iree_hal_device_t*)logical_device;
} else {
iree_hal_device_release((iree_hal_device_t*)logical_device);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static void iree_hal_amdgpu_logical_device_destroy(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_allocator_t host_allocator = iree_hal_device_host_allocator(base_device);
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_amdgpu_profile_counter_session_t* counter_session =
logical_device->profiling.counter_session;
iree_hal_amdgpu_profile_trace_session_t* trace_session =
logical_device->profiling.trace_session;
if (trace_session) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count; ++j) {
iree_hal_amdgpu_host_queue_disable_profile_traces(
&physical_device->host_queues[j]);
}
}
logical_device->profiling.trace_session = NULL;
iree_hal_amdgpu_profile_trace_session_free(trace_session);
}
if (counter_session) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count;
++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
for (iree_host_size_t j = 0; j < physical_device->host_queue_count; ++j) {
iree_hal_amdgpu_host_queue_disable_profile_counters(
&physical_device->host_queues[j]);
}
}
logical_device->profiling.counter_session = NULL;
iree_hal_amdgpu_profile_counter_session_free(counter_session);
}
iree_hal_amdgpu_logical_device_reset_profile_options(logical_device);
logical_device->profiling.session_id = 0;
iree_hal_amdgpu_profile_event_streams_deinitialize(
&logical_device->profiling.event_streams, logical_device->host_allocator);
iree_hal_amdgpu_logical_device_deassign_frontier(logical_device);
// Devices may hold allocations and need to be cleaned up first.
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_deinitialize(
logical_device->physical_devices[i]);
}
iree_hal_allocator_release(logical_device->device_allocator);
iree_hal_channel_provider_release(logical_device->channel_provider);
// This may unload HSA; must come after all resources are released.
iree_hal_amdgpu_system_free(logical_device->system);
iree_hal_amdgpu_profile_metadata_deinitialize(
&logical_device->profile_metadata);
// Note that these may be used by other child data types and must be freed
// last.
iree_arena_block_pool_deinitialize(&logical_device->host_block_pools.small);
iree_arena_block_pool_deinitialize(&logical_device->host_block_pools.large);
iree_arena_block_pool_deinitialize(
&logical_device->host_block_pools.command_buffer);
iree_async_proactor_pool_release(logical_device->proactor_pool);
iree_allocator_free(host_allocator, logical_device);
IREE_TRACE_ZONE_END(z0);
}
static iree_string_view_t iree_hal_amdgpu_logical_device_id(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return logical_device->identifier;
}
static iree_allocator_t iree_hal_amdgpu_logical_device_host_allocator(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return logical_device->host_allocator;
}
static iree_hal_allocator_t* iree_hal_amdgpu_logical_device_allocator(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return logical_device->device_allocator;
}
static void iree_hal_amdgpu_replace_device_allocator(
iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_allocator_retain(new_allocator);
iree_hal_allocator_release(logical_device->device_allocator);
logical_device->device_allocator = new_allocator;
}
static void iree_hal_amdgpu_replace_channel_provider(
iree_hal_device_t* base_device, iree_hal_channel_provider_t* new_provider) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_channel_provider_retain(new_provider);
iree_hal_channel_provider_release(logical_device->channel_provider);
logical_device->channel_provider = new_provider;
}
static iree_status_t iree_hal_amdgpu_logical_device_trim(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
// Release pooled resources from each physical device. These may return items
// back to the parent logical device pools.
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_physical_device_trim(
logical_device->physical_devices[i]));
}
// Trim the allocator pools, if any.
IREE_RETURN_IF_ERROR(
iree_hal_allocator_trim(logical_device->device_allocator));
// Trim host pools.
iree_arena_block_pool_trim(&logical_device->host_block_pools.small);
iree_arena_block_pool_trim(&logical_device->host_block_pools.large);
iree_arena_block_pool_trim(&logical_device->host_block_pools.command_buffer);
return iree_ok_status();
}
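// Handles device metadata queries: hal.device.id pattern matching,
// hal.executable.format support checks, and device/dispatch concurrency
// hints.
//
// Illustrative usage through the generic HAL API (not part of this file):
//   int64_t concurrency = 0;
//   IREE_RETURN_IF_ERROR(iree_hal_device_query_i64(
//       device, IREE_SV("hal.dispatch"), IREE_SV("concurrency"),
//       &concurrency));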
static iree_status_t iree_hal_amdgpu_logical_device_query_i64(
iree_hal_device_t* base_device, iree_string_view_t category,
iree_string_view_t key, int64_t* out_value) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
*out_value = 0;
if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) {
// NOTE: this is a fuzzy match and can allow a program to work with multiple
// device implementations.
*out_value =
iree_string_view_match_pattern(logical_device->identifier, key) ? 1 : 0;
return iree_ok_status();
}
iree_hal_amdgpu_system_t* system = logical_device->system;
if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) {
bool is_supported = false;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_format_supported(
&system->libhsa, system->topology.gpu_agents[0], key, &is_supported,
/*out_isa=*/NULL));
*out_value = is_supported ? 1 : 0;
return iree_ok_status();
}
if (iree_string_view_equal(category, IREE_SV("hal.device"))) {
if (iree_string_view_equal(key, IREE_SV("concurrency"))) {
*out_value = system->topology.gpu_agent_count *
system->topology.gpu_agent_queue_count;
return iree_ok_status();
}
} else if (iree_string_view_equal(category, IREE_SV("hal.dispatch"))) {
if (iree_string_view_equal(key, IREE_SV("concurrency"))) {
uint32_t compute_unit_count = 0;
IREE_RETURN_IF_ERROR(iree_hsa_agent_get_info(
IREE_LIBHSA(&system->libhsa), system->topology.gpu_agents[0],
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
&compute_unit_count));
*out_value = compute_unit_count;
return iree_ok_status();
}
}
return iree_make_status(
IREE_STATUS_NOT_FOUND,
"unknown device configuration key value '%.*s :: %.*s'",
(int)category.size, category.data, (int)key.size, key.data);
}
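// Populates the generic HAL capability flags and identity for the logical
// device. For composite (multi-GPU) logical devices only properties that hold
// for every physical device are reported; the rest are left at conservative
// defaults.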
static iree_status_t iree_hal_amdgpu_logical_device_query_capabilities(
iree_hal_device_t* base_device,
iree_hal_device_capabilities_t* out_capabilities) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
memset(out_capabilities, 0, sizeof(*out_capabilities));
if (logical_device->physical_device_count == 0) {
return iree_make_status(
IREE_STATUS_INTERNAL,
"logical device has no physical devices (initialization incomplete)");
}
// A multi-GPU logical device is a composite HAL device. Generic HAL topology
// has only one node for it, so do not expose a physical-device-0 identity as
// though it represented the entire composite. Exact internal physical device
// identity is reported through AMDGPU profile/device metadata and queue
// affinity records.
const bool is_composite_device = logical_device->physical_device_count > 1;
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[0];
memset(out_capabilities->physical_device_uuid, 0,
sizeof(out_capabilities->physical_device_uuid));
if (!is_composite_device && physical_device->has_physical_device_uuid) {
memcpy(out_capabilities->physical_device_uuid,
physical_device->physical_device_uuid,
sizeof(out_capabilities->physical_device_uuid));
out_capabilities->has_physical_device_uuid = true;
}
// Report a NUMA affinity only when the composite has a single nearest host
// node that fits the generic HAL uint8_t representation. Mixed-NUMA
// composites intentionally leave the default 0 because generic topology
// cannot express one logical device spanning multiple CPU NUMA nodes.
uint32_t host_numa_node = physical_device->host_numa_node;
bool has_representative_numa_node = host_numa_node <= UINT8_MAX;
for (iree_host_size_t i = 1; i < logical_device->physical_device_count &&
has_representative_numa_node;
++i) {
has_representative_numa_node =
logical_device->physical_devices[i]->host_numa_node == host_numa_node;
}
if (has_representative_numa_node) {
out_capabilities->numa_node = (uint8_t)host_numa_node;
}
// External handle types (DMA-BUF support from system info).
if (logical_device->system->info.dmabuf_supported) {
out_capabilities->buffer_export_types |=
IREE_HAL_TOPOLOGY_HANDLE_TYPE_DMA_BUF;
out_capabilities->buffer_import_types |=
IREE_HAL_TOPOLOGY_HANDLE_TYPE_DMA_BUF;
}
// Capability flags.
if (logical_device->system->info.svm_accessible_by_default) {
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_UNIFIED_MEMORY;
}
// AMDGPU semaphores are native async timeline semaphores (not binary
// emulation).
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_TIMELINE_SEMAPHORES;
// Fine-grained memory provides host coherency without explicit flushes.
// Coarse-grained memory requires fences, but the driver manages that
// transparently.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_HOST_COHERENT;
// All AMDGPU devices support device-scope atomics. System-scope atomics are
// supported on fine-grained memory when callers explicitly opt into
// host-visible placement.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_ATOMIC_SCOPE_DEVICE;
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_ATOMIC_SCOPE_SYSTEM;
// All AMD GPUs support peer-to-peer DMA (through XGMI or PCIe). The actual
// access mode for a specific GPU pair is determined by
// refine_topology_edge — here we declare the capability in principle.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_P2P_COPY;
// Peer addressability depends on whether SVM is enabled (large BAR / XGMI
// provides load/store access to peer memory without explicit grants).
if (logical_device->system->info.svm_accessible_by_default) {
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_PEER_ADDRESSABLE;
// SVM implies peer coherency on fine-grained memory.
out_capabilities->flags |= IREE_HAL_DEVICE_CAPABILITY_PEER_COHERENT;
}
// Driver handle (HSA agent handle for same-driver refinement). Composite
// devices intentionally leave this unset: a single HSA agent handle would
// make generic topology alias detection treat a composite as one GPU.
if (!is_composite_device) {
out_capabilities->driver_device_handle =
(uintptr_t)physical_device->device_agent.handle;
}
return iree_ok_status();
}
static const iree_hal_device_topology_info_t*
iree_hal_amdgpu_logical_device_topology_info(iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return &logical_device->topology_info;
}
// Maximum number of HSA memory-pool link hops we will stack-allocate.
#define IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS 16
typedef struct iree_hal_amdgpu_physical_topology_edge_t {
// Source-agent access to the destination coarse-grained memory pool.
hsa_amd_memory_pool_access_t coarse_access;
// Source-agent access to the destination fine-grained memory pool.
hsa_amd_memory_pool_access_t fine_access;
// True when |coarse_access| permits some direct device access.
bool coarse_accessible;
// True when |fine_access| permits some direct device access.
bool fine_accessible;
// True when every HSA-reported link hop supports coherent transactions.
bool all_hops_coherent;
// True when every HSA-reported link hop supports 32-bit atomics.
bool all_hops_atomic_32bit;
// True when every HSA-reported link hop supports 64-bit atomics.
bool all_hops_atomic_64bit;
// Worst physical link class across the reported HSA link hops.
iree_hal_topology_link_class_t link_class;
// Conservative copy-cost class derived from |link_class|.
uint8_t copy_cost;
// Conservative latency class derived from |link_class|.
uint8_t latency_class;
// Worst normalized NUMA distance reported by HSA link hops.
uint8_t numa_distance;
} iree_hal_amdgpu_physical_topology_edge_t;
typedef struct iree_hal_amdgpu_topology_edge_aggregate_t {
// Conservatively intersected capabilities valid for every physical pair.
iree_hal_topology_capability_t physical_capabilities;
// Worst non-coherent read mode across all physical pairs.
iree_hal_topology_interop_mode_t noncoherent_read_mode;
// Worst non-coherent write mode across all physical pairs.
iree_hal_topology_interop_mode_t noncoherent_write_mode;
// Worst coherent read mode across all physical pairs.
iree_hal_topology_interop_mode_t coherent_read_mode;
// Worst coherent write mode across all physical pairs.
iree_hal_topology_interop_mode_t coherent_write_mode;
// Worst link class across all physical pairs.
iree_hal_topology_link_class_t link_class;
// Worst copy-cost class across all physical pairs.
uint8_t copy_cost;
// Worst latency class across all physical pairs.
uint8_t latency_class;
// Worst normalized NUMA distance across all physical pairs.
uint8_t numa_distance;
} iree_hal_amdgpu_topology_edge_aggregate_t;
// Maps an HSA link type to a HAL topology link class.
// For multi-hop links, the caller should take the worst (highest) class.
static iree_hal_topology_link_class_t iree_hal_amdgpu_link_type_to_link_class(
hsa_amd_link_info_type_t link_type) {
switch (link_type) {
case HSA_AMD_LINK_INFO_TYPE_XGMI:
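      // XGMI is AMD's high-bandwidth GPU interconnect; bucket it with the
      // generic NVLink-class links.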
return IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF;
case HSA_AMD_LINK_INFO_TYPE_PCIE:
return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT;
case HSA_AMD_LINK_INFO_TYPE_QPI:
case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT:
// Cross-socket interconnects — treat as cross-root PCIe.
return IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT;
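    // NOTE: INFINBAND (sic) matches the enum spelling in hsa_ext_amd.h.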
case HSA_AMD_LINK_INFO_TYPE_INFINBAND:
return IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC;
default:
return IREE_HAL_TOPOLOGY_LINK_CLASS_OTHER;
}
}
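// Derives conservative copy-cost and latency classes (0 = cheapest, 15 = most
// expensive) from a link class. The values are a heuristic ordering, not
// measured bandwidths or latencies.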
static void iree_hal_amdgpu_topology_costs_from_link_class(
iree_hal_topology_link_class_t link_class, uint8_t* out_copy_cost,
uint8_t* out_latency_class) {
switch (link_class) {
case IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE:
*out_copy_cost = 0;
*out_latency_class = 0;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_NVLINK_IF:
*out_copy_cost = 3;
*out_latency_class = 3;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_SAME_ROOT:
*out_copy_cost = 7;
*out_latency_class = 7;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_PCIE_CROSS_ROOT:
*out_copy_cost = 9;
*out_latency_class = 9;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED:
*out_copy_cost = 13;
*out_latency_class = 11;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_FABRIC:
*out_copy_cost = 15;
*out_latency_class = 14;
break;
case IREE_HAL_TOPOLOGY_LINK_CLASS_ISOLATED:
*out_copy_cost = 15;
*out_latency_class = 15;
break;
default:
*out_copy_cost = 11;
*out_latency_class = 10;
break;
}
}
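// Scales an HSA-reported NUMA distance into the 4-bit HAL distance class.
// The subtraction of 10 assumes SLIT-style distances where 10 means local;
// e.g. a one-hop distance of 20 maps to (20 - 10) / 2 = 5.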
static uint8_t iree_hal_amdgpu_topology_scale_hsa_numa_distance(
uint32_t hsa_numa_distance) {
if (hsa_numa_distance == 0) return 0;
uint32_t scaled = hsa_numa_distance > 10 ? (hsa_numa_distance - 10) / 2 : 0;
return (uint8_t)iree_min(scaled, 15u);
}
static bool iree_hal_amdgpu_memory_pool_access_is_valid(
hsa_amd_memory_pool_access_t access) {
switch (access) {
case HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED:
case HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT:
case HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT:
return true;
default:
return false;
}
}
static iree_status_t iree_hal_amdgpu_validate_memory_pool_access(
hsa_amd_memory_pool_access_t access, const char* pool_kind) {
if (IREE_LIKELY(iree_hal_amdgpu_memory_pool_access_is_valid(access))) {
return iree_ok_status();
}
return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
"HSA reported unknown %s memory pool access mode %u",
pool_kind, (uint32_t)access);
}
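// Maps a source agent's access mode for a destination memory pool onto a HAL
// interop mode. NEVER_ALLOWED forces staged copies; ALLOWED_BY_DEFAULT permits
// native access; DISALLOWED_BY_DEFAULT (access exists but requires an explicit
// grant) conservatively keeps the caller-provided |base_mode|.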
static iree_hal_topology_interop_mode_t
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
hsa_amd_memory_pool_access_t access,
iree_hal_topology_interop_mode_t base_mode) {
switch (access) {
case HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED:
return IREE_HAL_TOPOLOGY_INTEROP_MODE_COPY;
case HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT:
return IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
case HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT:
default:
return base_mode;
}
}
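// Derives the HAL capability flags implied by one physical source->destination
// edge. A pair with no direct access to either pool kind reports no peer
// capabilities at all.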
static iree_hal_topology_capability_t
iree_hal_amdgpu_physical_topology_capabilities(
const iree_hal_amdgpu_physical_topology_edge_t* physical_edge) {
iree_hal_topology_capability_t capabilities =
IREE_HAL_TOPOLOGY_CAPABILITY_NONE;
if (!physical_edge->coarse_accessible && !physical_edge->fine_accessible) {
return capabilities;
}
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY;
if (physical_edge->all_hops_coherent) {
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT;
}
if (physical_edge->all_hops_atomic_32bit) {
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE;
}
if (physical_edge->all_hops_atomic_64bit) {
capabilities |= IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM;
}
return capabilities;
}
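// Resets |out_physical_edge| to its most optimistic state so that per-hop
// queries can only monotonically worsen it.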
static void iree_hal_amdgpu_physical_topology_edge_initialize(
iree_hal_amdgpu_physical_topology_edge_t* out_physical_edge) {
memset(out_physical_edge, 0, sizeof(*out_physical_edge));
out_physical_edge->coarse_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
out_physical_edge->fine_access = HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
out_physical_edge->all_hops_coherent = true;
out_physical_edge->all_hops_atomic_32bit = true;
out_physical_edge->all_hops_atomic_64bit = true;
out_physical_edge->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE;
}
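// Queries HSA for the interconnect between two physical devices and fills
// |out_physical_edge| with conservative (worst-hop) properties.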
static iree_status_t iree_hal_amdgpu_query_physical_topology_edge(
const iree_hal_amdgpu_libhsa_t* libhsa,
const iree_hal_amdgpu_physical_device_t* source_physical_device,
const iree_hal_amdgpu_physical_device_t* destination_physical_device,
iree_hal_amdgpu_physical_topology_edge_t* out_physical_edge) {
iree_hal_amdgpu_physical_topology_edge_initialize(out_physical_edge);
hsa_agent_t source_agent = source_physical_device->device_agent;
hsa_agent_t destination_agent = destination_physical_device->device_agent;
// Find both memory pool types on the destination agent. Not all devices
// expose both pool types; missing pools are treated as NEVER_ALLOWED for that
// pool kind, but an agent with no global pool at all is not a usable topology
// node.
hsa_amd_memory_pool_t dst_coarse_pool = {0};
bool has_coarse_pool = iree_hal_amdgpu_try_find_coarse_global_memory_pool(
libhsa, destination_agent, &dst_coarse_pool);
hsa_amd_memory_pool_t dst_fine_pool = {0};
bool has_fine_pool = iree_hal_amdgpu_try_find_fine_global_memory_pool(
libhsa, destination_agent, &dst_fine_pool);
if (!has_coarse_pool && !has_fine_pool) {
return iree_make_status(
IREE_STATUS_UNAVAILABLE,
"destination agent has neither coarse nor fine global memory pool");
}
if (has_coarse_pool) {
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, dst_coarse_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&out_physical_edge->coarse_access));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_memory_pool_access(
out_physical_edge->coarse_access, "coarse"));
}
if (has_fine_pool) {
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, dst_fine_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
&out_physical_edge->fine_access));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_validate_memory_pool_access(
out_physical_edge->fine_access, "fine"));
}
out_physical_edge->coarse_accessible =
out_physical_edge->coarse_access !=
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
out_physical_edge->fine_accessible = out_physical_edge->fine_access !=
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
// Query link hop count and topology. The link topology describes the
// interconnect between agents and is the same regardless of pool granularity;
// use whichever pool is present, preferring coarse-grained memory.
hsa_amd_memory_pool_t link_query_pool =
has_coarse_pool ? dst_coarse_pool : dst_fine_pool;
uint32_t hop_count = 0;
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, link_query_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hop_count));
if (hop_count > IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS) {
return iree_make_status(
IREE_STATUS_OUT_OF_RANGE,
"HSA reports %" PRIu32 " link hops between GPU agents (max %" PRIhsz
")",
hop_count, (iree_host_size_t)IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS);
}
if (hop_count > 0) {
// The LINK_INFO query writes exactly hop_count entries into the caller's
// buffer with no separate size parameter.
hsa_amd_memory_pool_link_info_t
link_info[IREE_HAL_AMDGPU_MAX_TOPOLOGY_LINK_HOPS];
memset(link_info, 0, sizeof(link_info[0]) * hop_count);
IREE_RETURN_IF_ERROR(iree_hsa_amd_agent_memory_pool_get_info(
IREE_LIBHSA(libhsa), source_agent, link_query_pool,
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_info));
for (uint32_t i = 0; i < hop_count; ++i) {
iree_hal_topology_link_class_t hop_class =
iree_hal_amdgpu_link_type_to_link_class(link_info[i].link_type);
if (hop_class > out_physical_edge->link_class) {
out_physical_edge->link_class = hop_class;
}
uint8_t numa_distance = iree_hal_amdgpu_topology_scale_hsa_numa_distance(
link_info[i].numa_distance);
if (numa_distance > out_physical_edge->numa_distance) {
out_physical_edge->numa_distance = numa_distance;
}
if (!link_info[i].coherent_support) {
out_physical_edge->all_hops_coherent = false;
}
if (!link_info[i].atomic_support_32bit) {
out_physical_edge->all_hops_atomic_32bit = false;
}
if (!link_info[i].atomic_support_64bit) {
out_physical_edge->all_hops_atomic_64bit = false;
}
}
}
if (!out_physical_edge->coarse_accessible &&
!out_physical_edge->fine_accessible) {
out_physical_edge->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_HOST_STAGED;
out_physical_edge->all_hops_coherent = false;
out_physical_edge->all_hops_atomic_32bit = false;
out_physical_edge->all_hops_atomic_64bit = false;
}
iree_hal_amdgpu_topology_costs_from_link_class(
out_physical_edge->link_class, &out_physical_edge->copy_cost,
&out_physical_edge->latency_class);
return iree_ok_status();
}
static void iree_hal_amdgpu_topology_edge_aggregate_initialize(
iree_hal_topology_edge_t edge,
iree_hal_amdgpu_topology_edge_aggregate_t* out_aggregate) {
  // Start physical facts at their best values so the aggregate can upgrade an
  // imprecise base edge and then monotonically worsen it as each physical pair
  // is folded in. Per-pair DISALLOWED_BY_DEFAULT access falls back to the base
  // edge mode in iree_hal_amdgpu_topology_edge_aggregate_include.
out_aggregate->physical_capabilities =
IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY |
IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM;
out_aggregate->noncoherent_read_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->noncoherent_write_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->coherent_read_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->coherent_write_mode = IREE_HAL_TOPOLOGY_INTEROP_MODE_NATIVE;
out_aggregate->link_class = IREE_HAL_TOPOLOGY_LINK_CLASS_SAME_DIE;
out_aggregate->copy_cost = 0;
out_aggregate->latency_class = 0;
out_aggregate->numa_distance = iree_hal_topology_edge_numa_distance(edge.lo);
}
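// Folds one physical pair's edge into |aggregate|, monotonically worsening
// interop modes, link class, cost classes, and NUMA distance.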
static void iree_hal_amdgpu_topology_edge_aggregate_include(
iree_hal_topology_edge_t base_edge,
const iree_hal_amdgpu_physical_topology_edge_t* physical_edge,
iree_hal_amdgpu_topology_edge_aggregate_t* aggregate) {
aggregate->physical_capabilities &=
iree_hal_amdgpu_physical_topology_capabilities(physical_edge);
aggregate->noncoherent_read_mode = iree_max(
aggregate->noncoherent_read_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->coarse_access,
iree_hal_topology_edge_buffer_read_mode_noncoherent(base_edge.lo)));
aggregate->noncoherent_write_mode = iree_max(
aggregate->noncoherent_write_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->coarse_access,
iree_hal_topology_edge_buffer_write_mode_noncoherent(base_edge.lo)));
aggregate->coherent_read_mode = iree_max(
aggregate->coherent_read_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->fine_access,
iree_hal_topology_edge_buffer_read_mode_coherent(base_edge.lo)));
aggregate->coherent_write_mode = iree_max(
aggregate->coherent_write_mode,
iree_hal_amdgpu_topology_mode_from_memory_pool_access(
physical_edge->fine_access,
iree_hal_topology_edge_buffer_write_mode_coherent(base_edge.lo)));
if (physical_edge->link_class > aggregate->link_class) {
aggregate->link_class = physical_edge->link_class;
}
if (physical_edge->copy_cost > aggregate->copy_cost) {
aggregate->copy_cost = physical_edge->copy_cost;
}
if (physical_edge->latency_class > aggregate->latency_class) {
aggregate->latency_class = physical_edge->latency_class;
}
if (physical_edge->numa_distance > aggregate->numa_distance) {
aggregate->numa_distance = physical_edge->numa_distance;
}
}
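// Writes the aggregated values back into the generic HAL edge. Only the
// capability bits this driver computes are replaced; all other capability
// flags on the edge are preserved.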
static void iree_hal_amdgpu_topology_edge_apply_aggregate(
const iree_hal_amdgpu_topology_edge_aggregate_t* aggregate,
iree_hal_topology_edge_t* edge) {
edge->lo = iree_hal_topology_edge_set_buffer_read_mode_noncoherent(
edge->lo, aggregate->noncoherent_read_mode);
edge->lo = iree_hal_topology_edge_set_buffer_write_mode_noncoherent(
edge->lo, aggregate->noncoherent_write_mode);
edge->lo = iree_hal_topology_edge_set_buffer_read_mode_coherent(
edge->lo, aggregate->coherent_read_mode);
edge->lo = iree_hal_topology_edge_set_buffer_write_mode_coherent(
edge->lo, aggregate->coherent_write_mode);
edge->lo =
iree_hal_topology_edge_set_link_class(edge->lo, aggregate->link_class);
edge->lo =
iree_hal_topology_edge_set_copy_cost(edge->lo, aggregate->copy_cost);
edge->lo = iree_hal_topology_edge_set_latency_class(edge->lo,
aggregate->latency_class);
edge->lo = iree_hal_topology_edge_set_numa_distance(edge->lo,
aggregate->numa_distance);
iree_hal_topology_capability_t capabilities =
iree_hal_topology_edge_capability_flags(edge->lo);
const iree_hal_topology_capability_t physical_capability_mask =
IREE_HAL_TOPOLOGY_CAPABILITY_P2P_COPY |
IREE_HAL_TOPOLOGY_CAPABILITY_PEER_COHERENT |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_DEVICE |
IREE_HAL_TOPOLOGY_CAPABILITY_ATOMIC_SYSTEM;
capabilities &= ~physical_capability_mask;
capabilities |= aggregate->physical_capabilities & physical_capability_mask;
edge->lo =
iree_hal_topology_edge_set_capability_flags(edge->lo, capabilities);
}
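// Refines a generic topology edge between two AMDGPU logical devices using HSA
// link and memory pool queries, conservatively aggregating over all physical
// source/destination pairs.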
static iree_status_t iree_hal_amdgpu_logical_device_refine_topology_edge(
iree_hal_device_t* src_device, iree_hal_device_t* dst_device,
iree_hal_topology_edge_t* edge) {
iree_hal_amdgpu_logical_device_t* src_logical =
iree_hal_amdgpu_logical_device_cast(src_device);
iree_hal_amdgpu_logical_device_t* dst_logical =
iree_hal_amdgpu_logical_device_cast(dst_device);
const iree_hal_amdgpu_libhsa_t* libhsa = &src_logical->system->libhsa;
if (src_logical->physical_device_count == 0 ||
dst_logical->physical_device_count == 0) {
return iree_make_status(
IREE_STATUS_INTERNAL,
"cannot refine AMDGPU topology edge with an empty physical device set");
}
iree_hal_amdgpu_topology_edge_aggregate_t aggregate;
iree_hal_amdgpu_topology_edge_aggregate_initialize(*edge, &aggregate);
// A composite logical device has one generic HAL topology node but several
// physical HSA agents. The generic edge must be valid for any source/dest
// physical pair because the scheduler cannot encode a subset-specific edge.
for (iree_host_size_t source_index = 0;
source_index < src_logical->physical_device_count; ++source_index) {
const iree_hal_amdgpu_physical_device_t* source_physical_device =
src_logical->physical_devices[source_index];
for (iree_host_size_t destination_index = 0;
destination_index < dst_logical->physical_device_count;
++destination_index) {
const iree_hal_amdgpu_physical_device_t* destination_physical_device =
dst_logical->physical_devices[destination_index];
iree_hal_amdgpu_physical_topology_edge_t physical_edge;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_query_physical_topology_edge(
libhsa, source_physical_device, destination_physical_device,
&physical_edge));
iree_hal_amdgpu_topology_edge_aggregate_include(*edge, &physical_edge,
&aggregate);
}
}
iree_hal_amdgpu_topology_edge_apply_aggregate(&aggregate, edge);
return iree_ok_status();
}
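// Assigns (or, when |topology_info| is NULL, clears) frontier tracking for the
// logical device: allocates the per-queue epoch signal table and then wires
// each physical device to the shared frontier tracker.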
static iree_status_t iree_hal_amdgpu_logical_device_assign_topology_info(
iree_hal_device_t* base_device,
const iree_hal_device_topology_info_t* topology_info) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
if (!topology_info) {
iree_hal_amdgpu_logical_device_deassign_frontier(logical_device);
return iree_ok_status();
}
IREE_TRACE_ZONE_BEGIN(z0);
iree_hal_amdgpu_system_t* system = logical_device->system;
const uint8_t device_count = (uint8_t)system->topology.gpu_agent_count;
const uint8_t queue_stride = (uint8_t)system->topology.gpu_agent_queue_count;
const iree_host_size_t table_size =
iree_hal_amdgpu_epoch_signal_table_size(device_count, queue_stride);
iree_status_t status =
iree_allocator_malloc(logical_device->host_allocator, table_size,
(void**)&logical_device->host_queue_epoch_table);
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_epoch_signal_table_initialize(
logical_device->host_queue_epoch_table,
iree_async_axis_session(topology_info->frontier.base_axis),
iree_async_axis_machine(topology_info->frontier.base_axis),
device_count, queue_stride);
}
for (iree_host_size_t device_ordinal = 0;
device_ordinal < logical_device->physical_device_count &&
iree_status_is_ok(status);
++device_ordinal) {
const iree_host_size_t host_ordinal =
system->topology.gpu_cpu_map[device_ordinal];
status = iree_hal_amdgpu_physical_device_assign_frontier(
base_device, system, logical_device->proactor,
topology_info->frontier.tracker, topology_info->frontier.base_axis,
logical_device->host_queue_epoch_table,
&system->host_memory_pools[host_ordinal],
logical_device->host_allocator,
logical_device->physical_devices[device_ordinal]);
}
if (iree_status_is_ok(status)) {
logical_device->topology_info = *topology_info;
logical_device->frontier_tracker = topology_info->frontier.tracker;
logical_device->axis = topology_info->frontier.base_axis;
iree_async_frontier_tracker_retain(logical_device->frontier_tracker);
} else {
iree_hal_amdgpu_logical_device_deassign_frontier(logical_device);
}
IREE_TRACE_ZONE_END(z0);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_create_channel(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_channel_params_t params, iree_hal_channel_t** out_channel) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU collective channels not yet implemented");
}
static iree_status_t iree_hal_amdgpu_logical_device_create_command_buffer(
iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode,
iree_hal_command_category_t command_categories,
iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
iree_hal_command_buffer_t** out_command_buffer) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_queue_affinity_t effective_queue_affinity = 0;
iree_host_size_t device_ordinal = 0;
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_normalize_command_buffer_affinity(
logical_device, queue_affinity, &effective_queue_affinity,
&device_ordinal));
const iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[device_ordinal];
return iree_hal_amdgpu_aql_command_buffer_create(
iree_hal_device_allocator(base_device), mode, command_categories,
effective_queue_affinity, binding_capacity, device_ordinal,
physical_device->prepublished_kernarg_storage,
&logical_device->profile_metadata,
&logical_device->host_block_pools.command_buffer,
&logical_device->host_block_pools.small, logical_device->host_allocator,
out_command_buffer);
}
static iree_status_t iree_hal_amdgpu_logical_device_create_event(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_event_flags_t flags, iree_hal_event_t** out_event) {
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU events not yet implemented");
}
static iree_status_t iree_hal_amdgpu_logical_device_create_executable_cache(
iree_hal_device_t* base_device, iree_string_view_t identifier,
iree_hal_executable_cache_t** out_executable_cache) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return iree_hal_amdgpu_executable_cache_create(
&logical_device->system->libhsa, &logical_device->system->topology,
&logical_device->profile_metadata, identifier,
iree_hal_device_host_allocator(base_device), out_executable_cache);
}
static iree_status_t iree_hal_amdgpu_logical_device_import_file(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_memory_access_t access, iree_io_file_handle_t* handle,
iree_hal_external_file_flags_t flags, iree_hal_file_t** out_file) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_normalize(
logical_device->queue_affinity_mask, queue_affinity, &queue_affinity));
return iree_hal_file_from_handle(
iree_hal_device_allocator(base_device), queue_affinity, access, handle,
logical_device->proactor, iree_hal_device_host_allocator(base_device),
out_file);
}
static iree_status_t iree_hal_amdgpu_logical_device_create_semaphore(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
uint64_t initial_value, iree_hal_semaphore_flags_t flags,
iree_hal_semaphore_t** out_semaphore) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
return iree_hal_amdgpu_semaphore_create(
logical_device, logical_device->proactor, queue_affinity, initial_value,
flags, logical_device->host_allocator, out_semaphore);
}
static iree_hal_semaphore_compatibility_t
iree_hal_amdgpu_logical_device_query_semaphore_compatibility(
iree_hal_device_t* base_device, iree_hal_semaphore_t* semaphore) {
if (iree_hal_amdgpu_semaphore_isa(semaphore)) {
return IREE_HAL_SEMAPHORE_COMPATIBILITY_ALL;
}
return IREE_HAL_SEMAPHORE_COMPATIBILITY_HOST_ONLY;
}
static iree_status_t iree_hal_amdgpu_logical_device_query_queue_pool_backend(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
iree_hal_queue_pool_backend_t* out_backend) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_physical_device_t* physical_device = NULL;
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_select_queue_pool_physical_device(
logical_device, queue_affinity, &physical_device));
out_backend->slab_provider = physical_device->default_slab_provider;
out_backend->notification = physical_device->default_pool_notification;
out_backend->epoch_query = (iree_hal_pool_epoch_query_t){
.fn = iree_hal_amdgpu_logical_device_query_pool_epoch,
.user_data = logical_device,
};
return iree_ok_status();
}
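// Queue operations below resolve |queue_affinity| to a single host queue via
// iree_hal_amdgpu_logical_device_select_host_queue and forward to its vtable.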
static iree_status_t iree_hal_amdgpu_logical_device_queue_alloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_pool_t* pool, iree_hal_buffer_params_t params,
iree_device_size_t allocation_size, iree_hal_alloca_flags_t flags,
iree_hal_buffer_t** IREE_RESTRICT out_buffer) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->alloca(queue, wait_semaphore_list,
signal_semaphore_list, pool, params,
allocation_size, flags, out_buffer);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_dealloca(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* buffer, iree_hal_dealloca_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->dealloca(queue, wait_semaphore_list,
signal_semaphore_list, buffer, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_fill(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, const void* pattern,
iree_host_size_t pattern_length, iree_hal_fill_flags_t flags) {
  // Match the HAL contract documented on iree_hal_command_buffer_fill_buffer
  // (1/2/4-byte patterns only) so queue_fill and command_buffer_fill accept
  // the same inputs across all backends. The device kernel itself supports an
  // 8-byte pattern path via iree_hal_amdgpu_device_buffer_fill_x8, but we
  // deliberately do not expose it here: code issuing 8-byte fills would only
  // be portable to the amdgpu backend.
if (IREE_UNLIKELY(pattern_length != 1 && pattern_length != 2 &&
pattern_length != 4)) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"fill patterns must be 1, 2, or 4 bytes (got %" PRIhsz ")",
pattern_length);
}
if (IREE_UNLIKELY(!pattern)) {
return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
"fill pattern pointer is required");
}
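  // Widen the 1/2/4-byte pattern into a fixed 64-bit value: the queue vtable
  // takes (pattern_bits, pattern_length) rather than a host pointer, avoiding
  // any pattern lifetime requirements on the caller.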
uint64_t pattern_bits = 0;
memcpy(&pattern_bits, pattern, pattern_length);
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->fill(queue, wait_semaphore_list, signal_semaphore_list,
target_buffer, target_offset, length, pattern_bits,
pattern_length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_update(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
const void* source_buffer, iree_host_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_update_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->update(
queue, wait_semaphore_list, signal_semaphore_list, source_buffer,
source_offset, target_buffer, target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_copy(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_copy_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->copy(queue, wait_semaphore_list, signal_semaphore_list,
source_buffer, source_offset, target_buffer,
target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_read(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_file_t* source_file, uint64_t source_offset,
iree_hal_buffer_t* target_buffer, iree_device_size_t target_offset,
iree_device_size_t length, iree_hal_read_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->read(queue, wait_semaphore_list, signal_semaphore_list,
source_file, source_offset, target_buffer,
target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_write(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_buffer_t* source_buffer, iree_device_size_t source_offset,
iree_hal_file_t* target_file, uint64_t target_offset,
iree_device_size_t length, iree_hal_write_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->write(queue, wait_semaphore_list, signal_semaphore_list,
source_buffer, source_offset, target_file,
target_offset, length, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_host_call(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_host_call_t call, const uint64_t args[4],
iree_hal_host_call_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->host_call(queue, wait_semaphore_list,
signal_semaphore_list, call, args, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_dispatch(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_executable_t* executable,
iree_hal_executable_export_ordinal_t export_ordinal,
const iree_hal_dispatch_config_t config, iree_const_byte_span_t constants,
const iree_hal_buffer_ref_list_t bindings,
iree_hal_dispatch_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->dispatch(
queue, wait_semaphore_list, signal_semaphore_list, executable,
export_ordinal, config, constants, bindings, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_execute(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity,
const iree_hal_semaphore_list_t wait_semaphore_list,
const iree_hal_semaphore_list_t signal_semaphore_list,
iree_hal_command_buffer_t* command_buffer,
iree_hal_buffer_binding_table_t binding_table,
iree_hal_execute_flags_t flags) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_select_host_queue(
logical_device, queue_affinity, &queue));
return queue->vtable->execute(queue, wait_semaphore_list,
signal_semaphore_list, command_buffer,
binding_table, flags);
}
static iree_status_t iree_hal_amdgpu_logical_device_queue_flush(
iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_normalize(
logical_device->queue_affinity_mask, queue_affinity, &queue_affinity));
IREE_HAL_FOR_QUEUE_AFFINITY(queue_affinity) {
iree_hal_amdgpu_virtual_queue_t* queue = NULL;
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_queue_from_ordinal(
logical_device, queue_ordinal, &queue));
IREE_RETURN_IF_ERROR(queue->vtable->flush(queue));
}
return iree_ok_status();
}
static iree_status_t
iree_hal_amdgpu_logical_device_verify_queue_device_profiling_supported(
iree_hal_amdgpu_logical_device_t* logical_device) {
for (iree_host_size_t i = 0; i < logical_device->physical_device_count; ++i) {
iree_hal_amdgpu_physical_device_t* physical_device =
logical_device->physical_devices[i];
if (iree_hal_amdgpu_vendor_packet_capabilities_support_timestamp_range(
physical_device->vendor_packet_capabilities)) {
continue;
}
return iree_make_status(
IREE_STATUS_FAILED_PRECONDITION,
"AMDGPU queue operation profiling requires PM4 timestamp range "
"support on physical device %" PRIhsz,
physical_device->device_ordinal);
}
return iree_ok_status();
}
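// Begins a profiling session: resolves and clones the options, allocates
// counter/trace/device-metrics sessions, opens the sink session, and enables
// HSA/counter/trace profiling. On any failure everything already enabled is
// torn down in reverse order.
//
// Illustrative usage through the generic HAL API (not part of this file):
//   IREE_RETURN_IF_ERROR(iree_hal_device_profiling_begin(device, &options));
//   // ... submit and wait on the workloads to capture ...
//   IREE_RETURN_IF_ERROR(iree_hal_device_profiling_end(device));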
static iree_status_t iree_hal_amdgpu_logical_device_profiling_begin(
iree_hal_device_t* base_device,
const iree_hal_device_profiling_options_t* options) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_hal_device_profiling_options_t resolved_options =
iree_hal_amdgpu_logical_device_resolve_profiling_options(options);
if (iree_hal_device_profiling_options_requests_data(
&resolved_options,
IREE_HAL_DEVICE_PROFILING_DATA_HOST_EXECUTION_EVENTS)) {
return iree_make_status(
IREE_STATUS_UNIMPLEMENTED,
"AMDGPU profiling does not produce host execution events");
}
if (resolved_options.data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_ok_status();
}
if (!logical_device->frontier_tracker) {
return iree_make_status(
IREE_STATUS_FAILED_PRECONDITION,
"AMDGPU profiling requires an assigned device topology");
}
if (logical_device->profiling.options.data_families !=
IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
"cannot nest AMDGPU profile captures");
}
if (iree_hal_device_profiling_options_requests_data(
&resolved_options,
IREE_HAL_DEVICE_PROFILING_DATA_DEVICE_QUEUE_EVENTS)) {
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_verify_queue_device_profiling_supported(
logical_device));
}
bool sink_session_begun = false;
bool hsa_profiling_enabled = false;
bool counter_profiling_enabled = false;
bool trace_profiling_enabled = false;
iree_hal_device_profiling_options_t session_options = {0};
iree_hal_device_profiling_options_storage_t* options_storage = NULL;
iree_hal_amdgpu_profile_counter_session_t* counter_session = NULL;
iree_hal_amdgpu_profile_trace_session_t* trace_session = NULL;
iree_hal_amdgpu_profile_device_metrics_session_t* device_metrics_session =
NULL;
iree_status_t status = iree_hal_device_profiling_options_clone(
&resolved_options, logical_device->host_allocator, &session_options,
&options_storage);
iree_hal_profile_sink_t* sink = session_options.sink;
uint64_t session_id = 0;
iree_hal_profile_chunk_metadata_t metadata = {0};
if (iree_status_is_ok(status)) {
session_id = logical_device->next_profile_session_id++;
metadata = iree_hal_amdgpu_logical_device_profile_session_metadata(
logical_device, session_id);
logical_device->profiling.next_clock_correlation_sample_id = 1;
memset(&logical_device->profiling.metadata_cursor, 0,
sizeof(logical_device->profiling.metadata_cursor));
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_counter_session_allocate(
logical_device, &session_options, logical_device->host_allocator,
&counter_session);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_trace_session_allocate(
logical_device, &session_options, logical_device->host_allocator,
&trace_session);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_device_metrics_session_allocate(
logical_device, &session_options, logical_device->host_allocator,
&device_metrics_session);
}
if (iree_status_is_ok(status)) {
status = iree_hal_profile_sink_begin_session(sink, &metadata);
sink_session_begun = iree_status_is_ok(status);
}
if (iree_status_is_ok(status) &&
iree_hal_device_profiling_options_requests_data(
&session_options, IREE_HAL_DEVICE_PROFILING_DATA_QUEUE_EVENTS)) {
status = iree_hal_amdgpu_profile_event_streams_ensure_queue_storage(
&logical_device->profiling.event_streams,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_QUEUE_EVENT_CAPACITY,
logical_device->host_allocator);
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_profile_event_streams_clear_queue(
&logical_device->profiling.event_streams);
}
}
if (iree_status_is_ok(status) &&
iree_hal_device_profiling_options_requests_data(
&session_options, IREE_HAL_DEVICE_PROFILING_DATA_MEMORY_EVENTS)) {
status = iree_hal_amdgpu_profile_event_streams_ensure_memory_storage(
&logical_device->profiling.event_streams,
IREE_HAL_AMDGPU_LOGICAL_DEVICE_PROFILE_MEMORY_EVENT_CAPACITY,
logical_device->host_allocator);
if (iree_status_is_ok(status)) {
iree_hal_amdgpu_profile_event_streams_clear_memory(
&logical_device->profiling.event_streams);
}
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_write_profile_metadata(
logical_device, sink, session_id, session_options.data_families);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_counter_session_write_metadata(
counter_session, sink, session_id, logical_device->identifier);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_device_metrics_session_write_metadata(
device_metrics_session, sink, session_id, logical_device->identifier);
}
if (iree_status_is_ok(status) &&
iree_hal_amdgpu_logical_device_profiling_needs_hsa_timestamps(
session_options.data_families)) {
status = iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
logical_device, true);
hsa_profiling_enabled = iree_status_is_ok(status);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
logical_device, counter_session, true);
counter_profiling_enabled = iree_status_is_ok(status);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
logical_device, trace_session, true);
trace_profiling_enabled = iree_status_is_ok(status);
}
if (iree_status_is_ok(status)) {
logical_device->profiling.options = session_options;
logical_device->profiling.options_storage = options_storage;
logical_device->profiling.session_id = session_id;
logical_device->profiling.counter_session = counter_session;
logical_device->profiling.trace_session = trace_session;
logical_device->profiling.device_metrics_session = device_metrics_session;
iree_hal_amdgpu_logical_device_set_queue_profiling_enabled(
logical_device,
iree_hal_amdgpu_logical_device_queue_profile_flags(&session_options));
} else {
if (trace_profiling_enabled) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
logical_device, trace_session, false));
}
if (counter_profiling_enabled) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
logical_device, counter_session, false));
}
if (hsa_profiling_enabled) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
logical_device, false));
}
if (sink_session_begun) {
status = iree_status_join(
status, iree_hal_profile_sink_end_session(sink, &metadata,
iree_status_code(status)));
}
logical_device->profiling.next_clock_correlation_sample_id = 0;
memset(&logical_device->profiling.metadata_cursor, 0,
sizeof(logical_device->profiling.metadata_cursor));
iree_hal_device_profiling_options_storage_free(
options_storage, logical_device->host_allocator);
iree_hal_amdgpu_profile_counter_session_free(counter_session);
iree_hal_amdgpu_profile_trace_session_free(trace_session);
iree_hal_amdgpu_profile_device_metrics_session_free(device_metrics_session);
}
return status;
}
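// Flushes buffered profile data mid-session without ending the session.
// A no-op when no session is active.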
static iree_status_t iree_hal_amdgpu_logical_device_profiling_flush(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
const iree_hal_device_profiling_options_t* options =
&logical_device->profiling.options;
if (options->data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_ok_status();
}
iree_hal_profile_sink_t* sink = options->sink;
const bool emit_executable_artifacts =
iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
options->data_families);
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_profile_metadata_write(
&logical_device->profile_metadata, sink,
logical_device->profiling.session_id, logical_device->identifier,
emit_executable_artifacts, &logical_device->profiling.metadata_cursor));
IREE_RETURN_IF_ERROR(iree_hal_amdgpu_logical_device_write_profile_events(
logical_device, sink, logical_device->profiling.session_id));
IREE_RETURN_IF_ERROR(
iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
logical_device, sink, logical_device->profiling.session_id));
return iree_hal_amdgpu_profile_device_metrics_session_sample_and_write(
logical_device->profiling.device_metrics_session, sink,
logical_device->profiling.session_id, logical_device->identifier);
}
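// Ends the active profiling session: writes any remaining metadata, events,
// clock correlations, and device metrics, then disables trace/counter/HSA
// profiling in reverse enable order and closes the sink session even if
// earlier writes failed.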
static iree_status_t iree_hal_amdgpu_logical_device_profiling_end(
iree_hal_device_t* base_device) {
iree_hal_amdgpu_logical_device_t* logical_device =
iree_hal_amdgpu_logical_device_cast(base_device);
iree_status_t status = iree_ok_status();
const iree_hal_device_profiling_data_families_t data_families =
logical_device->profiling.options.data_families;
if (data_families == IREE_HAL_DEVICE_PROFILING_DATA_NONE) {
return iree_ok_status();
}
iree_hal_profile_sink_t* sink = logical_device->profiling.options.sink;
iree_hal_amdgpu_profile_counter_session_t* counter_session =
logical_device->profiling.counter_session;
iree_hal_amdgpu_profile_trace_session_t* trace_session =
logical_device->profiling.trace_session;
iree_hal_amdgpu_profile_device_metrics_session_t* device_metrics_session =
logical_device->profiling.device_metrics_session;
const uint64_t session_id = logical_device->profiling.session_id;
iree_hal_profile_chunk_metadata_t metadata =
iree_hal_amdgpu_logical_device_profile_session_metadata(logical_device,
session_id);
const bool emit_executable_artifacts =
iree_hal_amdgpu_logical_device_profile_needs_executable_artifacts(
data_families);
status = iree_hal_amdgpu_profile_metadata_write(
&logical_device->profile_metadata, sink, session_id,
logical_device->identifier, emit_executable_artifacts,
&logical_device->profiling.metadata_cursor);
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_write_profile_events(
logical_device, sink, session_id);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_logical_device_write_profile_clock_correlations(
logical_device, sink, session_id);
}
if (iree_status_is_ok(status)) {
status = iree_hal_amdgpu_profile_device_metrics_session_sample_and_write(
device_metrics_session, sink, session_id, logical_device->identifier);
}
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_trace_profiling_enabled(
logical_device, trace_session, false));
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_counter_profiling_enabled(
logical_device, counter_session, false));
if (iree_hal_amdgpu_logical_device_profiling_needs_hsa_timestamps(
data_families)) {
status = iree_status_join(
status, iree_hal_amdgpu_logical_device_set_hsa_profiling_enabled(
logical_device, false));
}
status =
iree_status_join(status, iree_hal_profile_sink_end_session(
sink, &metadata, iree_status_code(status)));
iree_hal_amdgpu_logical_device_reset_profile_options(logical_device);
logical_device->profiling.session_id = 0;
logical_device->profiling.next_clock_correlation_sample_id = 0;
memset(&logical_device->profiling.metadata_cursor, 0,
sizeof(logical_device->profiling.metadata_cursor));
logical_device->profiling.counter_session = NULL;
logical_device->profiling.trace_session = NULL;
logical_device->profiling.device_metrics_session = NULL;
iree_hal_amdgpu_logical_device_set_queue_profiling_enabled(
logical_device, IREE_HAL_AMDGPU_HOST_QUEUE_PROFILE_FLAG_NONE);
iree_hal_amdgpu_profile_counter_session_free(counter_session);
iree_hal_amdgpu_profile_trace_session_free(trace_session);
iree_hal_amdgpu_profile_device_metrics_session_free(device_metrics_session);
return status;
}
static iree_status_t iree_hal_amdgpu_logical_device_external_capture_begin(
iree_hal_device_t* base_device,
const iree_hal_device_external_capture_options_t* options) {
(void)base_device;
(void)options;
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU external capture not implemented");
}
static iree_status_t iree_hal_amdgpu_logical_device_external_capture_end(
iree_hal_device_t* base_device) {
(void)base_device;
return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
"AMDGPU external capture not implemented");
}
static const iree_hal_device_vtable_t iree_hal_amdgpu_logical_device_vtable = {
.destroy = iree_hal_amdgpu_logical_device_destroy,
.id = iree_hal_amdgpu_logical_device_id,
.host_allocator = iree_hal_amdgpu_logical_device_host_allocator,
.device_allocator = iree_hal_amdgpu_logical_device_allocator,
.replace_device_allocator = iree_hal_amdgpu_replace_device_allocator,
.replace_channel_provider = iree_hal_amdgpu_replace_channel_provider,
.trim = iree_hal_amdgpu_logical_device_trim,
.query_i64 = iree_hal_amdgpu_logical_device_query_i64,
.query_capabilities = iree_hal_amdgpu_logical_device_query_capabilities,
.topology_info = iree_hal_amdgpu_logical_device_topology_info,
.refine_topology_edge = iree_hal_amdgpu_logical_device_refine_topology_edge,
.assign_topology_info = iree_hal_amdgpu_logical_device_assign_topology_info,
.create_channel = iree_hal_amdgpu_logical_device_create_channel,
.create_command_buffer =
iree_hal_amdgpu_logical_device_create_command_buffer,
.create_event = iree_hal_amdgpu_logical_device_create_event,
.create_executable_cache =
iree_hal_amdgpu_logical_device_create_executable_cache,
.import_file = iree_hal_amdgpu_logical_device_import_file,
.create_semaphore = iree_hal_amdgpu_logical_device_create_semaphore,
.query_semaphore_compatibility =
iree_hal_amdgpu_logical_device_query_semaphore_compatibility,
.query_queue_pool_backend =
iree_hal_amdgpu_logical_device_query_queue_pool_backend,
.queue_alloca = iree_hal_amdgpu_logical_device_queue_alloca,
.queue_dealloca = iree_hal_amdgpu_logical_device_queue_dealloca,
.queue_fill = iree_hal_amdgpu_logical_device_queue_fill,
.queue_update = iree_hal_amdgpu_logical_device_queue_update,
.queue_copy = iree_hal_amdgpu_logical_device_queue_copy,
.queue_read = iree_hal_amdgpu_logical_device_queue_read,
.queue_write = iree_hal_amdgpu_logical_device_queue_write,
.queue_host_call = iree_hal_amdgpu_logical_device_queue_host_call,
.queue_dispatch = iree_hal_amdgpu_logical_device_queue_dispatch,
.queue_execute = iree_hal_amdgpu_logical_device_queue_execute,
.queue_flush = iree_hal_amdgpu_logical_device_queue_flush,
.profiling_begin = iree_hal_amdgpu_logical_device_profiling_begin,
.profiling_flush = iree_hal_amdgpu_logical_device_profiling_flush,
.profiling_end = iree_hal_amdgpu_logical_device_profiling_end,
.external_capture_begin =
iree_hal_amdgpu_logical_device_external_capture_begin,
.external_capture_end = iree_hal_amdgpu_logical_device_external_capture_end,
};