// blob: d76c1bfeb9ae61ad3da089509387a316422f10b2 [file]
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include "iree/hal/drivers/amdgpu/executable.h"
#include "iree/base/internal/debugging.h"
#include "iree/hal/drivers/amdgpu/buffer.h"
#include "iree/hal/drivers/amdgpu/queue_affinity.h"
#include "iree/hal/drivers/amdgpu/util/code_object_target.h"
#include "iree/hal/drivers/amdgpu/util/hsaco_metadata.h"
#include "iree/hal/drivers/amdgpu/util/kernarg_ring.h"
#include "iree/hal/drivers/amdgpu/util/target_id.h"
#include "iree/hal/drivers/amdgpu/util/topology.h"
#include "iree/hal/drivers/amdgpu/util/vmem.h"
#include "iree/hal/utils/elf_format.h"
#include "iree/hal/utils/executable_debug_info.h"
#include "iree/hal/utils/executable_header.h"
// flatcc schemas:
#include "iree/base/internal/flatcc/parsing.h"
#include "iree/schemas/amdgpu_executable_def_reader.h"
#include "iree/schemas/amdgpu_executable_def_verifier.h"
//===----------------------------------------------------------------------===//
// ISA Support
//===----------------------------------------------------------------------===//
// Fixed-capacity accumulator for the ISAs an HSA agent reports supporting.
// Filled in iteration order by iree_hal_amdgpu_iterate_agent_isa during
// hsa_agent_iterate_isas; iteration fails with
// HSA_STATUS_ERROR_OUT_OF_RESOURCES if more than 32 ISAs are reported.
typedef struct iree_hal_amdgpu_agent_available_isas_t {
  // Number of valid entries in |values|.
  iree_host_size_t count;
  // Fixed-capacity ISA list populated by HSA iteration callbacks.
  hsa_isa_t values[32];
} iree_hal_amdgpu_agent_available_isas_t;
// Identity of a single agent ISA resolved from its HSA ISA name (e.g. a
// `gfx...` or `amdgcn-amd-amdhsa--...` string). |name| and |target_id| borrow
// storage from |name_buffer| and are only valid for this struct's lifetime.
typedef struct iree_hal_amdgpu_agent_isa_target_t {
  // HSA ISA handle the target identity was queried from.
  hsa_isa_t isa;
  // NUL-terminated HSA ISA name storage.
  char name_buffer[64 + /*NUL*/ 1];
  // Borrowed view into |name_buffer| excluding the NUL terminator.
  iree_string_view_t name;
  // Parsed target identity borrowing processor text from |name_buffer|.
  iree_hal_amdgpu_target_id_t target_id;
} iree_hal_amdgpu_agent_isa_target_t;
// hsa_agent_iterate_isas callback appending |isa| to the
// iree_hal_amdgpu_agent_available_isas_t passed via |user_data|.
// Aborts iteration with HSA_STATUS_ERROR_OUT_OF_RESOURCES when the
// fixed-capacity list is already full.
static hsa_status_t iree_hal_amdgpu_iterate_agent_isa(hsa_isa_t isa,
                                                      void* user_data) {
  iree_hal_amdgpu_agent_available_isas_t* state =
      (iree_hal_amdgpu_agent_available_isas_t*)user_data;
  if (state->count < IREE_ARRAYSIZE(state->values)) {
    state->values[state->count++] = isa;
    return HSA_STATUS_SUCCESS;
  }
  return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
// Queries the full set of ISAs supported by |device_agent| into
// |out_available_isas|. The output is zeroed before iteration so a failure
// still leaves it in a valid (empty) state.
static iree_status_t iree_hal_amdgpu_query_agent_available_isas(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t device_agent,
    iree_hal_amdgpu_agent_available_isas_t* out_available_isas) {
  memset(out_available_isas, 0, sizeof(*out_available_isas));
  iree_status_t status = iree_hsa_agent_iterate_isas(
      IREE_LIBHSA(libhsa), device_agent, iree_hal_amdgpu_iterate_agent_isa,
      out_available_isas);
  return status;
}
// Queries the HSA name of |isa| and parses it into a target identity stored
// in |out_isa_target|. The resulting |name|/|target_id| views borrow from
// |out_isa_target->name_buffer| and remain valid only for its lifetime.
static iree_status_t iree_hal_amdgpu_query_agent_isa_target(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_isa_t isa,
    iree_hal_amdgpu_agent_isa_target_t* out_isa_target) {
  memset(out_isa_target, 0, sizeof(*out_isa_target));
  out_isa_target->isa = isa;
  // NOTE(review): name_length is treated below as including the NUL
  // terminator (the string view trims one byte) — confirm against the
  // HSA_ISA_INFO_NAME_LENGTH contract of the runtime in use.
  uint32_t name_length = 0;
  IREE_RETURN_IF_ERROR(iree_hsa_isa_get_info_alt(
      IREE_LIBHSA(libhsa), isa, HSA_ISA_INFO_NAME_LENGTH, &name_length));
  // Reject names that would not fit the fixed 64+NUL buffer.
  if (name_length == 0 ||
      name_length > IREE_ARRAYSIZE(out_isa_target->name_buffer)) {
    return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED,
                            "ISA name invalid (empty or too long: %u)",
                            name_length);
  }
  IREE_RETURN_IF_ERROR(iree_hsa_isa_get_info_alt(IREE_LIBHSA(libhsa), isa,
                                                 HSA_ISA_INFO_NAME,
                                                 out_isa_target->name_buffer));
  out_isa_target->name = iree_make_string_view(out_isa_target->name_buffer,
                                               name_length - /*NUL*/ 1);
  return iree_hal_amdgpu_target_id_parse_hsa_isa_name(
      out_isa_target->name, &out_isa_target->target_id);
}
// Verifies that |isa_a| (on agent |agent_a_ordinal|) and |isa_b| (on agent
// |agent_b_ordinal|) describe the same target, accumulating every mismatch
// category into a bitmask so the error message can report all differences at
// once. Returns FAILED_PRECONDITION with formatted target strings on any
// mismatch; the ordinals are only used for error reporting.
static iree_status_t iree_hal_amdgpu_verify_isas_equal(
    const iree_hal_amdgpu_libhsa_t* libhsa, iree_host_size_t agent_a_ordinal,
    hsa_isa_t isa_a, iree_host_size_t agent_b_ordinal, hsa_isa_t isa_b,
    iree_host_size_t isa_ordinal) {
  // Resolve both ISAs into parsed target identities.
  iree_hal_amdgpu_agent_isa_target_t target_a;
  IREE_RETURN_IF_ERROR(
      iree_hal_amdgpu_query_agent_isa_target(libhsa, isa_a, &target_a));
  iree_hal_amdgpu_agent_isa_target_t target_b;
  IREE_RETURN_IF_ERROR(
      iree_hal_amdgpu_query_agent_isa_target(libhsa, isa_b, &target_b));
  iree_hal_amdgpu_target_compatibility_t mismatch =
      IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_COMPATIBLE;
  // Kind and full gfx version (major/minor/stepping) must match exactly.
  if (target_a.target_id.kind != target_b.target_id.kind ||
      target_a.target_id.version.major != target_b.target_id.version.major ||
      target_a.target_id.version.minor != target_b.target_id.version.minor ||
      target_a.target_id.version.stepping !=
          target_b.target_id.version.stepping) {
    mismatch |= IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_MISMATCH_PROCESSOR;
  }
  // Generic targets additionally compare the processor family text since the
  // version tuple alone does not identify the family.
  if ((target_a.target_id.kind == IREE_HAL_AMDGPU_TARGET_KIND_GENERIC ||
       target_b.target_id.kind == IREE_HAL_AMDGPU_TARGET_KIND_GENERIC) &&
      !iree_string_view_equal(target_a.target_id.processor,
                              target_b.target_id.processor)) {
    mismatch |= IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_MISMATCH_GENERIC_FAMILY;
  }
  if (target_a.target_id.generic_version !=
      target_b.target_id.generic_version) {
    mismatch |= IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_MISMATCH_GENERIC_VERSION;
  }
  // Feature flags (sramecc/xnack) are part of the target identity.
  if (target_a.target_id.sramecc != target_b.target_id.sramecc) {
    mismatch |= IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_MISMATCH_SRAMECC;
  }
  if (target_a.target_id.xnack != target_b.target_id.xnack) {
    mismatch |= IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_MISMATCH_XNACK;
  }
  if (mismatch != IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_COMPATIBLE) {
    // Format both targets and the mismatch reasons for a human-readable error.
    char target_a_string[128] = {0};
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_target_id_format(
        &target_a.target_id, sizeof(target_a_string), target_a_string,
        /*out_buffer_length=*/NULL));
    char target_b_string[128] = {0};
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_target_id_format(
        &target_b.target_id, sizeof(target_b_string), target_b_string,
        /*out_buffer_length=*/NULL));
    char mismatch_reasons[128] = {0};
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_target_compatibility_format(
        mismatch, sizeof(mismatch_reasons), mismatch_reasons,
        /*out_buffer_length=*/NULL));
    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                            "GPU agent[%" PRIhsz "] ISA[%" PRIhsz
                            "] target `%s` does not match GPU agent[%" PRIhsz
                            "] ISA[%" PRIhsz "] target `%s` (mismatched %s)",
                            agent_a_ordinal, isa_ordinal, target_a_string,
                            agent_b_ordinal, isa_ordinal, target_b_string,
                            mismatch_reasons);
  }
  return iree_ok_status();
}
// Verifies that every GPU agent in |topology| supports the same ISA set as
// gpu_agents[0] (same count and pairwise-equal targets in the same order).
// Returns FAILED_PRECONDITION on the first divergence found.
iree_status_t iree_hal_amdgpu_verify_device_isa_commonality(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology) {
  // If only one agent then no need to check.
  if (topology->gpu_agent_count == 1) return iree_ok_status();
  IREE_TRACE_ZONE_BEGIN(z0);
  // Query all available ISAs supported by the first GPU agent.
  // We'll use this to compare with all other GPU agents.
  iree_hal_amdgpu_agent_available_isas_t expected_isas;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_amdgpu_query_agent_available_isas(
              libhsa, topology->gpu_agents[0], &expected_isas));
  // For all subsequent GPU agents ensure their ISAs match.
  for (iree_host_size_t i = 1; i < topology->gpu_agent_count; ++i) {
    // Get ISAs supported by this agent.
    iree_hal_amdgpu_agent_available_isas_t available_isas;
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_hal_amdgpu_query_agent_available_isas(
                libhsa, topology->gpu_agents[i], &available_isas));
    // Ensure ISAs match.
    // We could be less strict here and require only one matching ISA that we
    // share for all devices but in practice today all devices have a single
    // supported ISA and we expect devices to match exactly so this is just for
    // being thorough.
    if (available_isas.count != expected_isas.count) {
      // The status is always a failure here; the macro wrapper just handles
      // ending the trace zone before returning it.
      IREE_RETURN_AND_END_ZONE_IF_ERROR(
          z0, iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                               "runtime currently expects all GPU agents must "
                               "support the same ISAs; gpu_agents[%" PRIhsz
                               "] does not match gpu_agents[0]",
                               i));
    }
    // Compare ISAs positionally; counts are known equal at this point.
    for (iree_host_size_t j = 0; j < expected_isas.count; ++j) {
      IREE_RETURN_AND_END_ZONE_IF_ERROR(
          z0, iree_hal_amdgpu_verify_isas_equal(
                  libhsa, /*agent_a_ordinal=*/0, expected_isas.values[j],
                  /*agent_b_ordinal=*/i, available_isas.values[j],
                  /*isa_ordinal=*/j));
    }
  }
  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
// Checks whether executable |format| (a `gfx...` arch or full
// `amdgcn-amd-amdhsa--...` target triple string) is compatible with any ISA
// reported by |device_agent|. Sets |out_supported| (required) and, when
// provided, |out_isa| to the first compatible ISA handle. Unrecognized format
// prefixes are reported as unsupported rather than as an error.
iree_status_t iree_hal_amdgpu_executable_format_supported(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t device_agent,
    iree_string_view_t format, bool* out_supported, hsa_isa_t* out_isa) {
  IREE_ASSERT_ARGUMENT(out_supported);
  *out_supported = false;
  if (out_isa) out_isa->handle = 0;
  // Fast-path reject formats that cannot be AMDGPU targets.
  if (!iree_string_view_starts_with(format, IREE_SV("gfx")) &&
      !iree_string_view_starts_with(format, IREE_SV("amdgcn-amd-amdhsa--"))) {
    return iree_ok_status();
  }
  // Parse the format string leniently: prefix, arch-only, and feature-suffix
  // forms are all accepted.
  iree_hal_amdgpu_target_id_t format_target_id;
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_target_id_parse(
      format,
      IREE_HAL_AMDGPU_TARGET_ID_PARSE_FLAG_ALLOW_HSA_PREFIX |
          IREE_HAL_AMDGPU_TARGET_ID_PARSE_FLAG_ALLOW_ARCH_ONLY |
          IREE_HAL_AMDGPU_TARGET_ID_PARSE_FLAG_ALLOW_FEATURE_SUFFIXES,
      &format_target_id));
  // Query all available ISAs supported by any GPU agent.
  // This list is ordered by descending priority.
  iree_hal_amdgpu_agent_available_isas_t available_isas;
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_query_agent_available_isas(
      libhsa, device_agent, &available_isas));
  // Return the first (highest-priority) compatible ISA found.
  for (iree_host_size_t i = 0; i < available_isas.count; ++i) {
    iree_hal_amdgpu_agent_isa_target_t isa_target;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_query_agent_isa_target(
        libhsa, available_isas.values[i], &isa_target));
    if (iree_hal_amdgpu_target_id_check_compatible(&format_target_id,
                                                   &isa_target.target_id) ==
        IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_COMPATIBLE) {
      *out_supported = true;
      if (out_isa) *out_isa = isa_target.isa;
      return iree_ok_status();
    }
  }
  // No compatible ISAs found.
  *out_supported = false;
  if (out_isa) out_isa->handle = 0;
  return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// Executable Verification
//===----------------------------------------------------------------------===//
// Device dispatch limits queried from an agent ISA, used to validate
// per-export workgroup sizes during executable flatbuffer verification.
typedef struct iree_hal_amdgpu_device_limits_t {
  // Maximum total workgroup size from HSA_ISA_INFO_WORKGROUP_MAX_SIZE.
  uint32_t max_workgroup_size;
  // Maximum workgroup size per dimension from HSA_ISA_INFO_WORKGROUP_MAX_DIM.
  uint16_t max_workgroup_size_per_dim[3];
} iree_hal_amdgpu_device_limits_t;
// Populates |out_limits| with the workgroup limits of |isa| on
// |device_agent|. |out_limits| is zeroed first so a failed query leaves it in
// a defined state.
static iree_status_t iree_hal_amdgpu_query_device_limits(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t device_agent,
    hsa_isa_t isa, iree_hal_amdgpu_device_limits_t* out_limits) {
  IREE_TRACE_ZONE_BEGIN(z0);
  memset(out_limits, 0, sizeof(*out_limits));
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_isa_get_info_alt(IREE_LIBHSA(libhsa), isa,
                                    HSA_ISA_INFO_WORKGROUP_MAX_SIZE,
                                    &out_limits->max_workgroup_size));
  // Writes all three uint16_t dimension limits at once.
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_isa_get_info_alt(IREE_LIBHSA(libhsa), isa,
                                    HSA_ISA_INFO_WORKGROUP_MAX_DIM,
                                    &out_limits->max_workgroup_size_per_dim));
  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
// Returns true if |executable_data| begins with the 4-byte ExecutableDef
// flatbuffer file identifier (i.e. it is a header-wrapped flatbuffer rather
// than a raw HSACO ELF). A data_length of 0 means "size unknown" (see
// iree_hal_amdgpu_executable_infer_format) and the identifier is probed
// anyway.
static bool iree_hal_amdgpu_executable_data_is_wrapped_flatbuffer(
    iree_const_byte_span_t executable_data) {
  // Guard against spans with no backing storage: a NULL pointer with a zero
  // length would pass the length check below (0 == "unknown size") and the
  // memcmp would dereference NULL.
  if (!executable_data.data) {
    return false;
  }
  // Any known length must at least hold the 4-byte identifier.
  if (executable_data.data_length != 0 &&
      executable_data.data_length < sizeof(uint32_t)) {
    return false;
  }
  iree_const_byte_span_t identifier_data =
      iree_make_const_byte_span(executable_data.data, sizeof(uint32_t));
  if (iree_const_byte_span_is_empty(identifier_data)) {
    return false;
  }
  return memcmp(identifier_data.data,
                iree_hal_amdgpu_ExecutableDef_file_identifier,
                identifier_data.data_length) == 0;
}
// Extracts the code object bytes of the single ModuleDef in |module_defs|
// into |out_code_object_data| (a borrowed view into the flatbuffer storage).
// Fails with UNIMPLEMENTED when the executable declares zero or multiple
// modules and INVALID_ARGUMENT when the module or its image is missing/empty.
static iree_status_t iree_hal_amdgpu_executable_get_single_module_image(
    iree_hal_amdgpu_ModuleDef_vec_t module_defs,
    iree_const_byte_span_t* out_code_object_data) {
  *out_code_object_data = iree_const_byte_span_empty();
  // Today we require a single module. We could support multiple and link them
  // together by loading code objects in the order specified. This could be
  // useful if we ever made our own fat binaries or wanted to reuse shared ELFs
  // across multiple executables by having them reference the same ranges in a
  // larger file.
  const iree_host_size_t module_count =
      iree_hal_amdgpu_ModuleDef_vec_len(module_defs);
  if (module_count != 1) {
    return iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                            "only a single ModuleDef per ExecutableDef is "
                            "supported; executable declares %" PRIhsz
                            " modules",
                            module_count);
  }
  iree_hal_amdgpu_ModuleDef_table_t module_def =
      iree_hal_amdgpu_ModuleDef_vec_at(module_defs, 0);
  if (!module_def) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "module is NULL");
  }
  flatbuffers_string_t image = iree_hal_amdgpu_ModuleDef_image_get(module_def);
  if (!image) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "module image is empty");
  }
  const iree_host_size_t image_size = flatbuffers_string_len(image);
  if (image_size == 0) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "module image is empty");
  }
  *out_code_object_data =
      iree_make_const_byte_span((const uint8_t*)image, image_size);
  return iree_ok_status();
}
// Formats |target_id| as a canonical executable format string into the
// caller-provided |executable_format| buffer of |executable_format_capacity|
// bytes. The formatted length is not needed by callers so it is discarded.
static iree_status_t iree_hal_amdgpu_executable_format_from_target_id(
    const iree_hal_amdgpu_target_id_t* target_id,
    iree_host_size_t executable_format_capacity, char* executable_format) {
  const iree_status_t status = iree_hal_amdgpu_target_id_format(
      target_id, executable_format_capacity, executable_format,
      /*out_buffer_length=*/NULL);
  return status;
}
// Infers the executable format string and total byte size of
// |executable_data|, which is either a header-wrapped ExecutableDef
// flatbuffer or a raw HSACO ELF. A data_length of 0 requests unsafe size
// inference from the content itself. Writes the format into
// |executable_format| (capacity |executable_format_capacity|) and the
// resolved size into |out_inferred_size|.
iree_status_t iree_hal_amdgpu_executable_infer_format(
    iree_const_byte_span_t executable_data,
    iree_host_size_t executable_format_capacity, char* executable_format,
    iree_allocator_t host_allocator, iree_host_size_t* out_inferred_size) {
  // Kept for API symmetry; no allocation is required by either path.
  (void)host_allocator;
  const bool is_wrapped_flatbuffer =
      iree_hal_amdgpu_executable_data_is_wrapped_flatbuffer(executable_data);
  // Read the header prefix (with unsafe inference if size is unknown).
  const bool unsafe_infer_size = (executable_data.data_length == 0);
  iree_const_byte_span_t flatbuffer_data = iree_const_byte_span_empty();
  if (is_wrapped_flatbuffer) {
    IREE_RETURN_IF_ERROR(iree_hal_read_executable_flatbuffer_header(
        executable_data, unsafe_infer_size,
        iree_hal_amdgpu_ExecutableDef_file_identifier, &flatbuffer_data));
    // Verify the flatbuffer structure.
    const int verify_ret = iree_hal_amdgpu_ExecutableDef_verify_as_root(
        flatbuffer_data.data, flatbuffer_data.data_length);
    if (verify_ret != flatcc_verify_ok) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "flatbuffer verification failed: %s",
                              flatcc_verify_error_string(verify_ret));
    }
    iree_hal_amdgpu_ExecutableDef_table_t executable_def =
        iree_hal_amdgpu_ExecutableDef_as_root(flatbuffer_data.data);
    // The format is derived from the ELF embedded in the single module.
    iree_const_byte_span_t code_object_data = iree_const_byte_span_empty();
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_get_single_module_image(
        iree_hal_amdgpu_ExecutableDef_modules_get(executable_def),
        &code_object_data));
    iree_hal_amdgpu_target_id_t code_object_target_id;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_code_object_target_id_from_elf(
        code_object_data, &code_object_target_id));
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_format_from_target_id(
        &code_object_target_id, executable_format_capacity, executable_format));
    // Return the total size (header + flatbuffer).
    *out_inferred_size =
        sizeof(iree_flatbuffer_file_header_t) + flatbuffer_data.data_length;
    return iree_ok_status();
  } else {
    // Raw HSACO ELF path: derive the size from the ELF headers when unknown.
    iree_const_byte_span_t hsaco_data = executable_data;
    if (unsafe_infer_size) {
      iree_host_size_t hsaco_size = 0;
      IREE_RETURN_IF_ERROR(iree_hal_elf_calculate_size(hsaco_data, &hsaco_size),
                           "calculating raw HSACO ELF size");
      hsaco_data = iree_make_const_byte_span(executable_data.data, hsaco_size);
    }
    iree_hal_amdgpu_target_id_t code_object_target_id;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_code_object_target_id_from_elf(
        hsaco_data, &code_object_target_id));
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_format_from_target_id(
        &code_object_target_id, executable_format_capacity, executable_format));
    *out_inferred_size = hsaco_data.data_length;
    return iree_ok_status();
  }
}
// Verifies the structure of the flatbuffer.
//
// There are still some conditions we must be aware of (such as omitted names
// on functions with internal linkage), however we shouldn't need to bounds
// check anything within the flatbuffer after this succeeds.
//
// Beyond structural verification this validates semantic constraints: each
// module has a non-empty image and each export has a non-empty symbol name, a
// present workgroup size within |limits|, bounded constant/binding counts,
// and valid debug info.
static iree_status_t iree_hal_amdgpu_executable_flatbuffer_verify(
    iree_const_byte_span_t flatbuffer_data,
    const iree_hal_amdgpu_device_limits_t* limits) {
  IREE_ASSERT(flatbuffer_data.data && flatbuffer_data.data_length >= 16);
  // Run flatcc generated verification. This ensures all pointers are in-bounds
  // and that we can safely walk the file, but not that the actual contents of
  // the flatbuffer meet our expectations.
  const int verify_ret = iree_hal_amdgpu_ExecutableDef_verify_as_root(
      flatbuffer_data.data, flatbuffer_data.data_length);
  if (verify_ret != flatcc_verify_ok) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "flatbuffer verification failed: %s",
                            flatcc_verify_error_string(verify_ret));
  }
  iree_hal_amdgpu_ExecutableDef_table_t executable_def =
      iree_hal_amdgpu_ExecutableDef_as_root(flatbuffer_data.data);
  // Every declared module must carry a non-empty code object image.
  iree_hal_amdgpu_ModuleDef_vec_t modules_vec =
      iree_hal_amdgpu_ExecutableDef_modules_get(executable_def);
  const iree_host_size_t module_count =
      iree_hal_amdgpu_ModuleDef_vec_len(modules_vec);
  for (iree_host_size_t i = 0; i < module_count; ++i) {
    iree_hal_amdgpu_ModuleDef_table_t module_def =
        iree_hal_amdgpu_ModuleDef_vec_at(modules_vec, i);
    if (!module_def) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "modules[%" PRIhsz "] is NULL", i);
    }
    if (flatbuffers_string_len(
            iree_hal_amdgpu_ModuleDef_image_get(module_def)) == 0) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "modules[%" PRIhsz "] contents are empty", i);
    }
  }
  iree_hal_amdgpu_ExportDef_vec_t exports_vec =
      iree_hal_amdgpu_ExecutableDef_exports_get(executable_def);
  const iree_host_size_t export_count =
      iree_hal_amdgpu_ExportDef_vec_len(exports_vec);
  for (iree_host_size_t i = 0; i < export_count; ++i) {
    iree_hal_amdgpu_ExportDef_table_t export_def =
        iree_hal_amdgpu_ExportDef_vec_at(exports_vec, i);
    // NOTE(review): NULL exports are skipped here while NULL modules above
    // are errors — confirm this asymmetry is intentional.
    if (!export_def) continue;
    if (flatbuffers_string_len(
            iree_hal_amdgpu_ExportDef_symbol_name_get(export_def)) == 0) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "exports[%" PRIhsz "] name is empty", i);
    }
    if (iree_hal_amdgpu_ExportDef_workgroup_size_is_present(export_def)) {
      // Each dimension and the total (x*y*z) must fit the device limits. The
      // product is widened to 64 bits to avoid 32-bit overflow.
      const iree_hal_amdgpu_Dims_struct_t workgroup_size =
          iree_hal_amdgpu_ExportDef_workgroup_size_get(export_def);
      if (workgroup_size->x > limits->max_workgroup_size_per_dim[0] ||
          workgroup_size->y > limits->max_workgroup_size_per_dim[1] ||
          workgroup_size->z > limits->max_workgroup_size_per_dim[2]) {
        return iree_make_status(
            IREE_STATUS_INVALID_ARGUMENT,
            "exports[%" PRIhsz
            "] workgroup size dims %ux%ux%u exceeds device maximum %ux%ux%u",
            i, workgroup_size->x, workgroup_size->y, workgroup_size->z,
            limits->max_workgroup_size_per_dim[0],
            limits->max_workgroup_size_per_dim[1],
            limits->max_workgroup_size_per_dim[2]);
      }
      const uint64_t total_workgroup_size =
          (uint64_t)workgroup_size->x * workgroup_size->y * workgroup_size->z;
      if (total_workgroup_size > limits->max_workgroup_size) {
        return iree_make_status(
            IREE_STATUS_INVALID_ARGUMENT,
            "exports[%" PRIhsz "] workgroup size total %" PRIu64
            " exceeds device maximum %u",
            i, total_workgroup_size, limits->max_workgroup_size);
      }
    } else {
      return iree_make_status(
          IREE_STATUS_INVALID_ARGUMENT,
          "exports[%" PRIhsz "] workgroup size dims are missing", i);
    }
    const uint32_t constant_count =
        iree_hal_amdgpu_ExportDef_constant_count_get(export_def);
    if (constant_count > IREE_HAL_AMDGPU_MAX_DISPATCH_CONSTANT_COUNT) {
      return iree_make_status(
          IREE_STATUS_INVALID_ARGUMENT,
          "exports[%" PRIhsz "] constant_count %u exceeds maximum of %u", i,
          constant_count, IREE_HAL_AMDGPU_MAX_DISPATCH_CONSTANT_COUNT);
    }
    iree_hal_amdgpu_BindingBits_vec_t binding_flags_vec =
        iree_hal_amdgpu_ExportDef_binding_flags_get(export_def);
    if (iree_hal_amdgpu_BindingBits_vec_len(binding_flags_vec) >
        IREE_HAL_AMDGPU_MAX_DISPATCH_BINDING_COUNT) {
      return iree_make_status(
          IREE_STATUS_INVALID_ARGUMENT,
          "exports[%" PRIhsz "] binding_flags count %zu exceeds maximum of %u",
          i, iree_hal_amdgpu_BindingBits_vec_len(binding_flags_vec),
          IREE_HAL_AMDGPU_MAX_DISPATCH_BINDING_COUNT);
    }
    // Delegates validation of the optional debug info table.
    IREE_RETURN_IF_ERROR(iree_hal_debug_verify_export_def(
        iree_hal_amdgpu_ExportDef_debug_info_get(export_def)));
  }
  return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// Executable Loading
//===----------------------------------------------------------------------===//
// Returns true if bit |physical_device_ordinal| is set in
// |physical_device_mask|. Ordinals at or beyond IREE_HAL_MAX_QUEUES can never
// be present (and would shift out of range), so they short-circuit to false.
static bool iree_hal_amdgpu_physical_device_mask_contains(
    uint64_t physical_device_mask, iree_host_size_t physical_device_ordinal) {
  if (physical_device_ordinal >= IREE_HAL_MAX_QUEUES) return false;
  const uint64_t ordinal_bit = 1ull << physical_device_ordinal;
  return (physical_device_mask & ordinal_bit) == ordinal_bit;
}
// Resolves |requested_affinity| against the GPU agents in |topology| and
// populates |out_physical_devices| with the selected physical device set.
// Builds the full supported-affinity domain by OR-ing together each physical
// device's queue affinity before delegating selection.
static iree_status_t iree_hal_amdgpu_executable_select_physical_devices(
    const iree_hal_amdgpu_topology_t* topology,
    iree_hal_queue_affinity_t requested_affinity,
    iree_hal_amdgpu_queue_affinity_physical_device_set_t*
        out_physical_devices) {
  memset(out_physical_devices, 0, sizeof(*out_physical_devices));
  iree_hal_amdgpu_queue_affinity_domain_t queue_affinity_domain = {
      .supported_affinity = 0,
      .physical_device_count = topology->gpu_agent_count,
      .queue_count_per_physical_device = topology->gpu_agent_queue_count,
  };
  for (iree_host_size_t physical_device_ordinal = 0;
       physical_device_ordinal < topology->gpu_agent_count;
       ++physical_device_ordinal) {
    iree_hal_queue_affinity_t physical_device_affinity = 0;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_queue_affinity_for_physical_device(
        queue_affinity_domain, physical_device_ordinal,
        &physical_device_affinity));
    // NOTE(review): this must mutate supported_affinity in place; presumably
    // iree_hal_queue_affinity_or_into is a macro expanding to `|=` — confirm,
    // as a by-value function call here would be a no-op.
    iree_hal_queue_affinity_or_into(queue_affinity_domain.supported_affinity,
                                    physical_device_affinity);
  }
  return iree_hal_amdgpu_queue_affinity_select_physical_devices(
      queue_affinity_domain, requested_affinity, out_physical_devices);
}
// Formats |target_id| into |buffer| (capacity |buffer_capacity|) for use in
// diagnostic messages; the formatted length is not needed and is discarded.
static iree_status_t iree_hal_amdgpu_executable_format_target_id_for_message(
    const iree_hal_amdgpu_target_id_t* target_id,
    iree_host_size_t buffer_capacity, char* buffer) {
  const iree_status_t status = iree_hal_amdgpu_target_id_format(
      target_id, buffer_capacity, buffer, /*out_buffer_length=*/NULL);
  return status;
}
// Checks that |code_object_target_id| is compatible with at least one ISA
// reported by |device_agent| before the HSA loader is invoked, producing a
// far more actionable error than the loader would. On failure the message
// includes the code object target, the agent's first (highest-priority) ISA
// target, and the first mismatch reasons observed.
static iree_status_t iree_hal_amdgpu_executable_preflight_agent_code_object(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t device_agent,
    iree_host_size_t physical_device_ordinal,
    const iree_hal_amdgpu_target_id_t* code_object_target_id) {
  iree_hal_amdgpu_agent_available_isas_t available_isas;
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_query_agent_available_isas(
      libhsa, device_agent, &available_isas));
  if (available_isas.count == 0) {
    return iree_make_status(IREE_STATUS_INCOMPATIBLE,
                            "GPU agent[%" PRIhsz
                            "] reports no AMDGPU ISA targets",
                            physical_device_ordinal);
  }
  // Track the first incompatible ISA so the error message can explain why it
  // did not match; any compatible ISA returns OK immediately.
  iree_hal_amdgpu_target_compatibility_t first_mismatch =
      IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_COMPATIBLE;
  char first_agent_target[128] = {0};
  for (iree_host_size_t i = 0; i < available_isas.count; ++i) {
    iree_hal_amdgpu_agent_isa_target_t isa_target;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_query_agent_isa_target(
        libhsa, available_isas.values[i], &isa_target));
    const iree_hal_amdgpu_target_compatibility_t compatibility =
        iree_hal_amdgpu_target_id_check_compatible(code_object_target_id,
                                                   &isa_target.target_id);
    if (compatibility == IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_COMPATIBLE) {
      return iree_ok_status();
    }
    if (first_mismatch == IREE_HAL_AMDGPU_TARGET_COMPATIBILITY_COMPATIBLE) {
      first_mismatch = compatibility;
      IREE_RETURN_IF_ERROR(
          iree_hal_amdgpu_executable_format_target_id_for_message(
              &isa_target.target_id, sizeof(first_agent_target),
              first_agent_target));
    }
  }
  // No ISA matched: format everything needed for the diagnostic.
  char code_object_target[128] = {0};
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_format_target_id_for_message(
      code_object_target_id, sizeof(code_object_target), code_object_target));
  char mismatch_reasons[128] = {0};
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_target_compatibility_format(
      first_mismatch, sizeof(mismatch_reasons), mismatch_reasons,
      /*out_buffer_length=*/NULL));
  return iree_make_status(
      IREE_STATUS_INCOMPATIBLE,
      "AMDGPU code object target `%s` is not compatible with GPU agent[%" PRIhsz
      "] target `%s` (mismatched %s)",
      code_object_target, physical_device_ordinal, first_agent_target,
      mismatch_reasons);
}
// Preflights |code_object_target_id| against every GPU agent selected in
// |physical_devices|, failing fast on the first incompatible agent. Agents
// not present in the physical device mask are skipped.
static iree_status_t iree_hal_amdgpu_executable_preflight_code_object(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    const iree_hal_amdgpu_queue_affinity_physical_device_set_t*
        physical_devices,
    const iree_hal_amdgpu_target_id_t* code_object_target_id) {
  for (iree_host_size_t ordinal = 0; ordinal < topology->gpu_agent_count;
       ++ordinal) {
    const bool is_selected = iree_hal_amdgpu_physical_device_mask_contains(
        physical_devices->physical_device_mask, ordinal);
    if (is_selected) {
      IREE_RETURN_IF_ERROR(
          iree_hal_amdgpu_executable_preflight_agent_code_object(
              libhsa, topology->gpu_agents[ordinal], ordinal,
              code_object_target_id));
    }
  }
  return iree_ok_status();
}
// Loads an executable ELF from memory for selected agents in |topology| and
// stores the frozen executable in |out_handle|.
//
// |code_object_data| must remain valid for the duration of the call. The
// code object reader is always destroyed before returning; on any failure
// after executable creation the partially-built executable is destroyed and
// |out_handle| remains zeroed.
static iree_status_t iree_hal_amdgpu_executable_load_module(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    const iree_hal_amdgpu_queue_affinity_physical_device_set_t*
        physical_devices,
    const iree_hal_executable_params_t* executable_params,
    iree_const_byte_span_t code_object_data, hsa_executable_t* out_handle) {
  IREE_TRACE_ZONE_BEGIN(z0);
  *out_handle = (hsa_executable_t){0};
  // TODO(#18877): support executable constants in HSA executables.
  // We currently don't support executable constants but we could by way of
  // global symbols. We should be using externs for the constants and then
  // hsa_executable_readonly_variable_define to specify each. We have to
  // allocate one copy of the constant table per agent. I don't know if it's
  // best to have one base symbol pointing to the table or one symbol per
  // constant in the table.
  if (executable_params->constant_count != 0) {
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_make_status(IREE_STATUS_UNIMPLEMENTED,
                             "executable constants not yet implemented"));
  }
  // TODO(benvanik): figure out what options we could pass? Documentation is ...
  // lacking. These may have only been used for HSAIL anyway.
  const char* options = NULL;
  // Verify target compatibility up-front for a better error than the loader's.
  iree_hal_amdgpu_target_id_t code_object_target_id;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_amdgpu_code_object_target_id_from_elf(
              code_object_data, &code_object_target_id));
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_amdgpu_executable_preflight_code_object(
              libhsa, topology, physical_devices, &code_object_target_id));
  // Bind a code object reader to the memory sourced from our rodata.
  hsa_code_object_reader_t code_object_reader;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_code_object_reader_create_from_memory(
              IREE_LIBHSA(libhsa), (const char*)code_object_data.data,
              code_object_data.data_length, &code_object_reader));
  // Create the executable that will hold all of the loaded code objects.
  // TODO(benvanik): pass profile/rounding mode from queried info.
  // From here on failures fall through so the reader is always destroyed.
  hsa_executable_t handle = {0};
  iree_status_t status = iree_hsa_executable_create_alt(
      IREE_LIBHSA(libhsa), HSA_PROFILE_FULL,
      HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, options, &handle);
  // Load the code object for each selected agent.
  for (iree_host_size_t i = 0;
       iree_status_is_ok(status) && i < topology->gpu_agent_count; ++i) {
    if (!iree_hal_amdgpu_physical_device_mask_contains(
            physical_devices->physical_device_mask, i)) {
      continue;
    }
    status = iree_hsa_executable_load_agent_code_object(
        IREE_LIBHSA(libhsa), handle, topology->gpu_agents[i],
        code_object_reader, options, NULL);
  }
  // Freeze the executable now that loading has completed. Most queries require
  // that the executable be frozen.
  if (iree_status_is_ok(status)) {
    status = iree_hsa_executable_freeze(IREE_LIBHSA(libhsa), handle, options);
  }
  // Release the reader now that the executable has been fully loaded.
  status =
      iree_status_join(status, iree_hsa_code_object_reader_destroy(
                                   IREE_LIBHSA(libhsa), code_object_reader));
  if (iree_status_is_ok(status)) {
    *out_handle = handle;
  } else if (handle.handle) {
    // Tear down the partially-constructed executable on failure.
    status = iree_status_join(
        status, iree_hsa_executable_destroy(IREE_LIBHSA(libhsa), handle));
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Mutable state threaded through the loaded-code-object iteration callback
// used to find the code object loaded for a particular agent.
typedef struct iree_hal_amdgpu_executable_find_loaded_code_object_state_t {
  // Borrowed HSA API table used for loader extension queries.
  const iree_hal_amdgpu_libhsa_t* libhsa;
  // HSA agent whose loaded code object is being searched.
  hsa_agent_t agent;
  // Loaded code object matching |agent| when found.
  hsa_loaded_code_object_t loaded_code_object;
} iree_hal_amdgpu_executable_find_loaded_code_object_state_t;
// hsa_ven_amd_loader_executable_iterate_loaded_code_objects callback.
// Captures |loaded_code_object| into the find state and stops iteration with
// HSA_STATUS_INFO_BREAK when its owning agent matches the one searched for.
static hsa_status_t iree_hal_amdgpu_executable_iterate_loaded_code_object(
    hsa_executable_t executable, hsa_loaded_code_object_t loaded_code_object,
    void* user_data) {
  iree_hal_amdgpu_executable_find_loaded_code_object_state_t* find_state =
      (iree_hal_amdgpu_executable_find_loaded_code_object_state_t*)user_data;
  hsa_agent_t agent = {0};
  hsa_status_t hsa_status =
      find_state->libhsa->amd_loader
          .hsa_ven_amd_loader_loaded_code_object_get_info(
              loaded_code_object,
              HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_AGENT, &agent);
  if (hsa_status != HSA_STATUS_SUCCESS) return hsa_status;
  if (agent.handle == find_state->agent.handle) {
    find_state->loaded_code_object = loaded_code_object;
    // INFO_BREAK signals "found" to the caller of the iteration.
    return HSA_STATUS_INFO_BREAK;
  }
  return HSA_STATUS_SUCCESS;
}
// Finds the code object loaded for |agent| within |executable| via the AMD
// loader extension. The iteration status is inverted from the usual
// convention: HSA_STATUS_INFO_BREAK means the callback found a match, while
// HSA_STATUS_SUCCESS means iteration completed without one (NOT_FOUND).
static iree_status_t iree_hal_amdgpu_executable_find_loaded_code_object(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    hsa_agent_t agent, hsa_loaded_code_object_t* out_loaded_code_object) {
  *out_loaded_code_object = (hsa_loaded_code_object_t){0};
  iree_hal_amdgpu_executable_find_loaded_code_object_state_t find_state = {
      .libhsa = libhsa,
      .agent = agent,
      .loaded_code_object = {0},
  };
  hsa_status_t hsa_status =
      libhsa->amd_loader
          .hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
              executable, iree_hal_amdgpu_executable_iterate_loaded_code_object,
              &find_state);
  // SUCCESS = full iteration with no match.
  if (hsa_status == HSA_STATUS_SUCCESS) {
    return iree_make_status(IREE_STATUS_NOT_FOUND,
                            "no loaded code object found for agent");
  }
  // Anything other than the callback's INFO_BREAK is a real error.
  if (hsa_status != HSA_STATUS_INFO_BREAK) {
    return iree_status_from_hsa_status(
        __FILE__, __LINE__, hsa_status,
        "hsa_ven_amd_loader_executable_iterate_loaded_code_objects",
        "iterating loaded executable code objects");
  }
  *out_loaded_code_object = find_state.loaded_code_object;
  return iree_ok_status();
}
// Populates |out_load_info| with the load delta and size of the code object
// loaded for |device_agent| within |executable|, for profiling/tooling use.
// Both the success and failure paths funnel through the HSA status
// translation at the bottom (an OK HSA status yields an ok iree_status_t).
static iree_status_t
iree_hal_amdgpu_executable_populate_profile_code_object_load_info(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    uint32_t physical_device_ordinal, hsa_agent_t device_agent,
    iree_hal_amdgpu_profile_code_object_load_info_t* out_load_info) {
  memset(out_load_info, 0, sizeof(*out_load_info));
  out_load_info->physical_device_ordinal = physical_device_ordinal;
  hsa_loaded_code_object_t loaded_code_object = {0};
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_find_loaded_code_object(
      libhsa, executable, device_agent, &loaded_code_object));
  // Query load delta first and only query size if that succeeded.
  hsa_status_t hsa_status =
      libhsa->amd_loader.hsa_ven_amd_loader_loaded_code_object_get_info(
          loaded_code_object,
          HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA,
          &out_load_info->load_delta);
  if (hsa_status == HSA_STATUS_SUCCESS) {
    hsa_status =
        libhsa->amd_loader.hsa_ven_amd_loader_loaded_code_object_get_info(
            loaded_code_object,
            HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE,
            &out_load_info->load_size);
  }
  return iree_status_from_hsa_status(
      __FILE__, __LINE__, hsa_status,
      "hsa_ven_amd_loader_loaded_code_object_get_info",
      "querying loaded executable code-object profile metadata");
}
// Upper bound on symbol name lengths copied onto the stack via alloca by the
// symbol-lookup helpers below; longer names are rejected before allocation.
#define IREE_HAL_AMDGPU_MAX_STACK_SYMBOL_NAME_LENGTH \
  ((iree_host_size_t)(4 * 1024))
// Looks up |symbol_name| (a NUL-terminated C string) in |executable| for
// |device_agent|, storing the result in |out_symbol|.
static iree_status_t iree_hal_amdgpu_executable_get_symbol_by_cstring(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    const char* symbol_name, hsa_agent_t device_agent,
    hsa_executable_symbol_t* out_symbol) {
  // Kernel callers pass the `.kd` suffix; globals use their source symbol name.
  return iree_hsa_executable_get_symbol_by_name(
      IREE_LIBHSA(libhsa), executable, symbol_name, &device_agent, out_symbol);
}
// Looks up a kernel symbol by its length-delimited |symbol_name| in
// |executable| for |device_agent|. Validates the name is non-empty and within
// the stack-copy bound before NUL-terminating a stack copy for the HSA API.
static iree_status_t iree_hal_amdgpu_executable_get_raw_hsaco_symbol_by_name(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    iree_string_view_t symbol_name, hsa_agent_t device_agent,
    hsa_executable_symbol_t* out_symbol) {
  if (iree_string_view_is_empty(symbol_name)) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "executable kernel symbol name is empty");
  }
  // Bounds the alloca below to at most 4KB + 1 of stack.
  if (symbol_name.size > IREE_HAL_AMDGPU_MAX_STACK_SYMBOL_NAME_LENGTH) {
    return iree_make_status(
        IREE_STATUS_OUT_OF_RANGE,
        "executable kernel symbol name `%.*s` exceeds maximum length %" PRIhsz,
        (int)symbol_name.size, symbol_name.data,
        IREE_HAL_AMDGPU_MAX_STACK_SYMBOL_NAME_LENGTH);
  }
  // AMDGPU MessagePack strings are length-delimited and not NUL-terminated.
  // Copy only at the HSA API boundary so ROCR can use its internal symbol map.
  char* symbol_name_storage = (char*)iree_alloca(symbol_name.size + 1);
  memcpy(symbol_name_storage, symbol_name.data, symbol_name.size);
  symbol_name_storage[symbol_name.size] = 0;
  return iree_hal_amdgpu_executable_get_symbol_by_cstring(
      libhsa, executable, symbol_name_storage, device_agent, out_symbol);
}
// Resolves a global (non-kernel) symbol whose name is length-delimited by
// staging a NUL-terminated copy on the stack for the HSA API boundary.
static iree_status_t iree_hal_amdgpu_executable_get_global_symbol_by_name(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    iree_string_view_t symbol_name, hsa_agent_t device_agent,
    hsa_executable_symbol_t* out_symbol) {
  // Reject names we cannot (or should not) stage on the stack up front.
  if (iree_string_view_is_empty(symbol_name)) {
    return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                            "executable global name is empty");
  } else if (symbol_name.size > IREE_HAL_AMDGPU_MAX_STACK_SYMBOL_NAME_LENGTH) {
    return iree_make_status(
        IREE_STATUS_OUT_OF_RANGE,
        "executable global name `%.*s` exceeds maximum length %" PRIhsz,
        (int)symbol_name.size, symbol_name.data,
        IREE_HAL_AMDGPU_MAX_STACK_SYMBOL_NAME_LENGTH);
  }
  // NUL-terminate at the API boundary only; the source view is borrowed.
  char* cstr = (char*)iree_alloca(symbol_name.size + 1);
  memcpy(cstr, symbol_name.data, symbol_name.size);
  cstr[symbol_name.size] = '\0';
  return iree_hal_amdgpu_executable_get_symbol_by_cstring(
      libhsa, executable, cstr, device_agent, out_symbol);
}
// Resolves the uniform kernel arguments that are the same on all GPU device
// agents in the topology (since we assume all are the same device type).
// All fields besides `kernel_object` will have valid values.
//
// |workgroup_size|, |constant_count|, and |binding_count| come from the
// caller's metadata; the segment sizes and kernarg requirements are queried
// from the HSA executable symbol itself.
static iree_status_t iree_hal_amdgpu_executable_resolve_kernel_args_from_symbol(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_symbol_t symbol,
    const uint32_t workgroup_size[3], uint16_t constant_count,
    uint16_t binding_count,
    iree_hal_amdgpu_device_kernel_args_t* out_kernel_args) {
  IREE_ASSERT_ARGUMENT(out_kernel_args);
  IREE_TRACE_ZONE_BEGIN(z0);
  // All of our kernels assume 3 dimensions.
  out_kernel_args->setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  out_kernel_args->workgroup_size[0] = workgroup_size[0];
  out_kernel_args->workgroup_size[1] = workgroup_size[1];
  out_kernel_args->workgroup_size[2] = workgroup_size[2];
  // NOTE: the object pointer is per-device and we populate that when uploading
  // device tables.
  out_kernel_args->kernel_object = 0;
  // Segment size information used to populate dispatch packets and reserve
  // space.
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_executable_symbol_get_info(
              IREE_LIBHSA(libhsa), symbol,
              HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
              &out_kernel_args->private_segment_size));
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_executable_symbol_get_info(
              IREE_LIBHSA(libhsa), symbol,
              HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
              &out_kernel_args->group_segment_size));
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_executable_symbol_get_info(
              IREE_LIBHSA(libhsa), symbol,
              HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
              &out_kernel_args->kernarg_size));
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_executable_symbol_get_info(
              IREE_LIBHSA(libhsa), symbol,
              HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
              &out_kernel_args->kernarg_alignment));
  // HAL ABI counts are carried alongside the HSA-derived segment info.
  out_kernel_args->binding_count = binding_count;
  out_kernel_args->constant_count = constant_count;
  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
// Queries the agent-specific kernel object handle for |symbol|.
// The value is only valid for dispatch on the agent the symbol was resolved
// against; per-device tables are populated with per-agent values.
static iree_status_t iree_hal_amdgpu_executable_resolve_kernel_object(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_symbol_t symbol,
    uint64_t* out_kernel_object) {
  return iree_hsa_executable_symbol_get_info(
      IREE_LIBHSA(libhsa), symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
      out_kernel_object);
}
// Uploads the provided kernel table to |device_agent| and returns the pointer.
// |host_kernel_args| must already have device-specific `kernel_object` fields.
//
// Returns NULL in |out_device_kernel_args| when |kernel_count| is zero. On
// copy failure the device allocation made here is freed before returning.
static iree_status_t iree_hal_amdgpu_executable_upload_resolved_kernel_table(
    const iree_hal_amdgpu_libhsa_t* libhsa, iree_host_size_t kernel_count,
    iree_hal_amdgpu_device_kernel_args_t* host_kernel_args,
    hsa_agent_t device_agent,
    IREE_AMDGPU_DEVICE_PTR const iree_hal_amdgpu_device_kernel_args_t**
        out_device_kernel_args) {
  IREE_TRACE_ZONE_BEGIN(z0);
  *out_device_kernel_args = NULL;
  if (kernel_count == 0) {
    IREE_TRACE_ZONE_END(z0);
    return iree_ok_status();
  }

  // Guard the table size computation against overflow; matches the checked
  // arithmetic used elsewhere in this file for table sizing.
  iree_host_size_t kernel_args_table_size = 0;
  if (!iree_host_size_checked_mul(kernel_count, sizeof(host_kernel_args[0]),
                                  &kernel_args_table_size)) {
    IREE_TRACE_ZONE_END(z0);
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "kernel args table size overflow");
  }

  // Find a memory pool on the agent where we can upload the table.
  hsa_amd_memory_pool_t memory_pool = {0};
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0,
      iree_hal_amdgpu_find_coarse_global_memory_pool(libhsa, device_agent,
                                                     &memory_pool),
      "finding memory pool for storing kernel arg tables");

  // Allocate device kernel argument storage.
  iree_hal_amdgpu_device_kernel_args_t* device_kernel_args = NULL;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hsa_amd_memory_pool_allocate(
              IREE_LIBHSA(libhsa), memory_pool, kernel_args_table_size,
              HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&device_kernel_args));

  // Copy the entire table to device memory.
  iree_status_t status =
      iree_hsa_memory_copy(IREE_LIBHSA(libhsa), device_kernel_args,
                           host_kernel_args, kernel_args_table_size);
  if (iree_status_is_ok(status)) {
    *out_device_kernel_args = device_kernel_args;
  } else {
    // Free the device allocation; join so the copy failure is still reported.
    status = iree_status_join(
        status,
        iree_hsa_amd_memory_pool_free(IREE_LIBHSA(libhsa), device_kernel_args));
  }

  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Resolves each flatbuffer export's agent-specific kernel object into
// |host_kernel_args| and uploads the completed table to |device_agent|.
static iree_status_t iree_hal_amdgpu_executable_upload_flatbuffer_kernel_table(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    iree_hal_amdgpu_ExportDef_vec_t export_defs,
    iree_hal_amdgpu_device_kernel_args_t* host_kernel_args,
    hsa_agent_t device_agent,
    IREE_AMDGPU_DEVICE_PTR const iree_hal_amdgpu_device_kernel_args_t**
        out_device_kernel_args) {
  const iree_host_size_t kernel_count =
      iree_hal_amdgpu_ExportDef_vec_len(export_defs);
  for (iree_host_size_t i = 0; i < kernel_count; ++i) {
    iree_hal_amdgpu_ExportDef_table_t export_def =
        iree_hal_amdgpu_ExportDef_vec_at(export_defs, i);
    // Flatbuffer strings are NUL-terminated; a missing/empty name is invalid.
    flatbuffers_string_t symbol_name =
        iree_hal_amdgpu_ExportDef_symbol_name_get(export_def);
    iree_string_view_t symbol_name_view = iree_string_view_empty();
    if (symbol_name) {
      symbol_name_view = iree_make_string_view(
          symbol_name, flatbuffers_string_len(symbol_name));
    }
    if (iree_string_view_is_empty(symbol_name_view)) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "executable kernel symbol name is empty");
    }
    hsa_executable_symbol_t symbol = {0};
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_get_symbol_by_cstring(
            libhsa, executable, symbol_name, device_agent, &symbol),
        "resolving `%.*s`", (int)symbol_name_view.size, symbol_name_view.data);
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_resolve_kernel_object(
        libhsa, symbol, &host_kernel_args[i].kernel_object));
  }
  return iree_hal_amdgpu_executable_upload_resolved_kernel_table(
      libhsa, kernel_count, host_kernel_args, device_agent,
      out_device_kernel_args);
}
// Resolves each raw-HSACO metadata kernel's agent-specific kernel object into
// |host_kernel_args| and uploads the completed table to |device_agent|.
static iree_status_t iree_hal_amdgpu_executable_upload_raw_hsaco_kernel_table(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    const iree_hal_amdgpu_hsaco_metadata_t* hsaco_metadata,
    iree_hal_amdgpu_device_kernel_args_t* host_kernel_args,
    hsa_agent_t device_agent,
    IREE_AMDGPU_DEVICE_PTR const iree_hal_amdgpu_device_kernel_args_t**
        out_device_kernel_args) {
  const iree_host_size_t kernel_count = hsaco_metadata->kernel_count;
  for (iree_host_size_t i = 0; i < kernel_count; ++i) {
    const iree_string_view_t symbol_name =
        hsaco_metadata->kernels[i].symbol_name;
    hsa_executable_symbol_t symbol = {0};
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_get_raw_hsaco_symbol_by_name(
            libhsa, executable, symbol_name, device_agent, &symbol),
        "resolving `%.*s`", (int)symbol_name.size, symbol_name.data);
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_resolve_kernel_object(
        libhsa, symbol, &host_kernel_args[i].kernel_object));
  }
  return iree_hal_amdgpu_executable_upload_resolved_kernel_table(
      libhsa, kernel_count, host_kernel_args, device_agent,
      out_device_kernel_args);
}
// Converts a kernarg layout's total byte size into a block count for the
// queue kernarg ring, reserving a minimum of one block.
static iree_status_t iree_hal_amdgpu_executable_calculate_kernarg_block_count(
    const iree_hal_amdgpu_device_dispatch_kernarg_layout_t* layout,
    uint32_t* out_kernarg_block_count) {
  iree_host_size_t block_count = iree_host_size_ceil_div(
      layout->total_kernarg_size, sizeof(iree_hal_amdgpu_kernarg_block_t));
  if (block_count == 0) {
    block_count = 1;  // always reserve at least one block
  }
  if (IREE_UNLIKELY(block_count > UINT32_MAX)) {
    return iree_make_status(
        IREE_STATUS_OUT_OF_RANGE,
        "dispatch kernargs require too many blocks (%" PRIhsz ", max=%u)",
        block_count, UINT32_MAX);
  }
  *out_kernarg_block_count = (uint32_t)block_count;
  return iree_ok_status();
}
// Builds a host-side dispatch descriptor from resolved |kernel_args|:
// validates kernarg alignment against the queue kernarg ring, precomputes the
// HAL and custom kernarg layouts/block counts, and derives per-dimension
// workgroup count limits. |out_descriptor| is zeroed first so error paths
// leave it in a benign state.
static iree_status_t iree_hal_amdgpu_executable_initialize_dispatch_descriptor(
    const iree_hal_amdgpu_device_kernel_args_t* kernel_args,
    iree_hal_amdgpu_executable_dispatch_descriptor_t* out_descriptor) {
  memset(out_descriptor, 0, sizeof(*out_descriptor));
  // Alignment must be a power of two for kernarg ring placement math.
  if (IREE_UNLIKELY(
          !iree_host_size_is_power_of_two(kernel_args->kernarg_alignment))) {
    return iree_make_status(
        IREE_STATUS_FAILED_PRECONDITION,
        "executable kernel kernarg alignment must be a power of two (got %u)",
        kernel_args->kernarg_alignment);
  }
  // The ring allocates in fixed-alignment blocks; a kernel demanding stricter
  // alignment than a block provides could never be satisfied.
  if (IREE_UNLIKELY(kernel_args->kernarg_alignment >
                    iree_alignof(iree_hal_amdgpu_kernarg_block_t))) {
    return iree_make_status(
        IREE_STATUS_FAILED_PRECONDITION,
        "executable kernel kernarg alignment %u exceeds queue kernarg ring "
        "alignment %" PRIhsz,
        kernel_args->kernarg_alignment,
        (iree_host_size_t)iree_alignof(iree_hal_amdgpu_kernarg_block_t));
  }
  out_descriptor->kernel_args = *kernel_args;
  // Kernel metadata reports the bytes actually consumed by the compiled
  // kernel. That may be smaller than the HAL ABI explicit argument footprint
  // when optimization removes unused bindings/constants; we still reserve and
  // populate the public HAL ABI bytes for dispatch.
  out_descriptor->hal_kernarg_layout =
      iree_hal_amdgpu_device_dispatch_make_hal_kernarg_layout(kernel_args);
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_calculate_kernarg_block_count(
      &out_descriptor->hal_kernarg_layout,
      &out_descriptor->hal_kernarg_block_count));
  out_descriptor->custom_kernarg_layout =
      iree_hal_amdgpu_device_dispatch_make_custom_kernarg_layout(
          kernel_args->kernarg_size);
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_calculate_kernarg_block_count(
      &out_descriptor->custom_kernarg_layout,
      &out_descriptor->custom_kernarg_block_count));
  // Largest grid each dimension can express in a uint32 packet field given
  // the fixed workgroup size (zero sizes would divide by zero below).
  for (iree_host_size_t i = 0; i < 3; ++i) {
    if (IREE_UNLIKELY(kernel_args->workgroup_size[i] == 0)) {
      return iree_make_status(
          IREE_STATUS_FAILED_PRECONDITION,
          "executable kernel workgroup size dimension %" PRIhsz " is zero", i);
    }
    out_descriptor->max_workgroup_count[i] =
        UINT32_MAX / kernel_args->workgroup_size[i];
  }
  // NOTE(review): headroom is computed against UINT32_MAX rather than a
  // queried device LDS limit - confirm callers clamp to hardware limits.
  out_descriptor->max_dynamic_workgroup_local_memory =
      UINT32_MAX - kernel_args->group_segment_size;
  return iree_ok_status();
}
//===----------------------------------------------------------------------===//
// iree_hal_amdgpu_executable_t
//===----------------------------------------------------------------------===//
// HAL executable implemented over a multi-agent HSA executable. Holds
// host-resident reflection and dispatch tables plus one device-resident
// kernel args table per loaded physical device. All trailing tables live in
// the same allocation as the struct (see the allocate routine).
typedef struct iree_hal_amdgpu_executable_t {
  // HAL executable resource header.
  iree_hal_resource_t resource;
  // Host allocator used for executable-owned metadata tables.
  iree_allocator_t host_allocator;
  // Unowned HSA API handle. Must remain valid for the lifetime of the
  // executable.
  const iree_hal_amdgpu_libhsa_t* libhsa;
  // Borrowed HAL device used for global-buffer placement metadata.
  iree_hal_device_t* device;
  // Loaded HSA executable with a code object for each device.
  hsa_executable_t handle;
  // Producer-local profile executable id assigned at creation.
  uint64_t profile_id;
  // Stable content hash for the exact loaded HSACO/code-object bytes.
  uint64_t profile_code_object_hash[2];
  // Total number of exports in the executable.
  iree_host_size_t kernel_count;
  // Host-resident reflection information for each export.
  iree_hal_executable_export_info_t* export_infos /*[kernel_count]*/;
  // Prefix-sum offsets into |export_parameters| for each export plus a
  // sentinel (so per-export spans are offset deltas).
  iree_host_size_t* export_parameter_offsets /*[kernel_count + 1]*/;
  // Host-resident parameter reflection records for all exports.
  iree_hal_executable_export_parameter_t* export_parameters;
  // Table of kernel args stored in host memory. We have them local so that
  // host-side command buffer recording doesn't need to access device memory.
  // The kernel object specified in each is invalid as it's agent-specific.
  iree_hal_amdgpu_device_kernel_args_t* host_kernel_args /*[kernel_count]*/;
  // Host-resident dispatch descriptors stored as [device_count][kernel_count].
  iree_hal_amdgpu_executable_dispatch_descriptor_t*
      host_dispatch_descriptors /*[device_count * kernel_count]*/;
  // Queue affinity this executable was loaded for after normalization.
  iree_hal_queue_affinity_t queue_affinity;
  // Queue-affinity domain used to resolve per-device globals.
  iree_hal_amdgpu_queue_affinity_domain_t queue_affinity_domain;
  // Bitmask of physical GPU device ordinals with loaded code objects.
  uint64_t loaded_physical_device_mask;
  // Number of loaded physical GPU devices in |loaded_physical_device_mask|.
  iree_host_size_t loaded_physical_device_count;
  // Total number of GPU devices in the topology used for per-device tables.
  iree_host_size_t device_count;
  // HSA GPU agents indexed by physical device ordinal.
  hsa_agent_t* device_agents /*[device_count]*/;
  // Table of kernel args stored in device memory, one copy per device.
  // Selected devices have an entire `kernel_count` set of args; unselected
  // devices remain NULL and fail lookup.
  IREE_AMDGPU_DEVICE_PTR const iree_hal_amdgpu_device_kernel_args_t*
      device_kernel_args[/*device_count*/];
} iree_hal_amdgpu_executable_t;
static const iree_hal_executable_vtable_t iree_hal_amdgpu_executable_vtable;
// Downcasts |base_value| to the AMDGPU executable implementation after
// asserting (in debug builds) that the vtable matches.
static iree_hal_amdgpu_executable_t* iree_hal_amdgpu_executable_cast(
    iree_hal_executable_t* base_value) {
  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_amdgpu_executable_vtable);
  return (iree_hal_amdgpu_executable_t*)base_value;
}
// Const-qualified variant of iree_hal_amdgpu_executable_cast for read-only
// query paths; asserts the vtable matches in debug builds.
static const iree_hal_amdgpu_executable_t*
iree_hal_amdgpu_executable_const_cast(const iree_hal_executable_t* base_value) {
  IREE_HAL_ASSERT_TYPE(base_value, &iree_hal_amdgpu_executable_vtable);
  return (const iree_hal_amdgpu_executable_t*)base_value;
}
// Wraps a flatbuffers string in a borrowed string view; a NULL flatbuffers
// string maps to the empty view.
static iree_string_view_t iree_hal_amdgpu_executable_flatbuffer_string_view(
    flatbuffers_string_t value) {
  if (!value) {
    return iree_string_view_empty();
  }
  return iree_make_string_view(value, flatbuffers_string_len(value));
}
// Picks the reflection name for an export: the debug-info name when present
// and non-empty, otherwise the symbol name stripped of its `.kd` suffix.
static iree_string_view_t iree_hal_amdgpu_executable_export_reflection_name(
    iree_hal_amdgpu_ExportDef_table_t export_def) {
  // Prefer the human-readable debug name when debug info was embedded.
  iree_hal_debug_ExportDef_table_t debug_def =
      iree_hal_amdgpu_ExportDef_debug_info_get(export_def);
  if (debug_def) {
    iree_string_view_t debug_name =
        iree_hal_amdgpu_executable_flatbuffer_string_view(
            iree_hal_debug_ExportDef_name_get(debug_def));
    if (!iree_string_view_is_empty(debug_name)) {
      return debug_name;
    }
  }
  // Fall back to the symbol name without the kernel descriptor suffix.
  iree_string_view_t symbol_name =
      iree_hal_amdgpu_executable_flatbuffer_string_view(
          iree_hal_amdgpu_ExportDef_symbol_name_get(export_def));
  return iree_string_view_strip_suffix(symbol_name, IREE_SV(".kd"));
}
// Allocates the executable plus all of its trailing variable-length tables in
// a single host allocation and wires up the interior pointers.
//
// Layout (computed via IREE_STRUCT_LAYOUT): the fixed struct with its
// flexible device_kernel_args[] tail, then export infos, export name storage,
// export parameter offsets, export parameters, parameter name storage, host
// kernel args, dispatch descriptors, and device agents.
//
// On success |out_export_name_storage| and
// |out_export_parameter_name_storage| point into the allocation for the
// caller to fill; |out_executable| owns the allocation.
static iree_status_t iree_hal_amdgpu_executable_allocate(
    iree_hal_device_t* device, const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    const iree_hal_amdgpu_queue_affinity_physical_device_set_t*
        physical_devices,
    iree_host_size_t export_count, iree_host_size_t export_name_storage_size,
    iree_host_size_t export_parameter_count,
    iree_host_size_t export_parameter_name_storage_size,
    iree_allocator_t host_allocator, char** out_export_name_storage,
    char** out_export_parameter_name_storage,
    iree_hal_amdgpu_executable_t** out_executable) {
  *out_export_name_storage = NULL;
  *out_export_parameter_name_storage = NULL;
  *out_executable = NULL;
  // Guard table-size arithmetic against overflow before computing the layout.
  iree_host_size_t dispatch_descriptor_count = 0;
  if (!iree_host_size_checked_mul(topology->gpu_agent_count, export_count,
                                  &dispatch_descriptor_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "dispatch descriptor table size overflow");
  }
  // +1 sentinel entry so consumers can take per-export offset deltas.
  iree_host_size_t export_parameter_offset_count = 0;
  if (!iree_host_size_checked_add(export_count, 1,
                                  &export_parameter_offset_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "export parameter offset table size overflow");
  }
  // Compute total allocation size and the offset of each trailing table.
  iree_host_size_t total_size = 0;
  iree_host_size_t export_infos_offset = 0;
  iree_host_size_t export_name_storage_offset = 0;
  iree_host_size_t export_parameter_offsets_offset = 0;
  iree_host_size_t export_parameters_offset = 0;
  iree_host_size_t export_parameter_name_storage_offset = 0;
  iree_host_size_t host_kernel_args_offset = 0;
  iree_host_size_t host_dispatch_descriptors_offset = 0;
  iree_host_size_t device_agents_offset = 0;
  IREE_RETURN_IF_ERROR(IREE_STRUCT_LAYOUT(
      sizeof(iree_hal_amdgpu_executable_t), &total_size,
      IREE_STRUCT_FIELD_FAM(
          topology->gpu_agent_count,
          IREE_AMDGPU_DEVICE_PTR const iree_hal_amdgpu_device_kernel_args_t*),
      IREE_STRUCT_FIELD(export_count, iree_hal_executable_export_info_t,
                        &export_infos_offset),
      IREE_STRUCT_FIELD(export_name_storage_size, char,
                        &export_name_storage_offset),
      IREE_STRUCT_FIELD(export_parameter_offset_count, iree_host_size_t,
                        &export_parameter_offsets_offset),
      IREE_STRUCT_FIELD(export_parameter_count,
                        iree_hal_executable_export_parameter_t,
                        &export_parameters_offset),
      IREE_STRUCT_FIELD(export_parameter_name_storage_size, char,
                        &export_parameter_name_storage_offset),
      IREE_STRUCT_FIELD(export_count, iree_hal_amdgpu_device_kernel_args_t,
                        &host_kernel_args_offset),
      IREE_STRUCT_FIELD(dispatch_descriptor_count,
                        iree_hal_amdgpu_executable_dispatch_descriptor_t,
                        &host_dispatch_descriptors_offset),
      IREE_STRUCT_FIELD(topology->gpu_agent_count, hsa_agent_t,
                        &device_agents_offset)));
  iree_hal_amdgpu_executable_t* executable = NULL;
  IREE_RETURN_IF_ERROR(
      iree_allocator_malloc(host_allocator, total_size, (void**)&executable));
  // NOTE(review): iree_allocator_malloc may already zero-initialize; the
  // explicit memset is kept for local clarity - confirm before removing.
  memset(executable, 0, total_size);
  iree_hal_resource_initialize(&iree_hal_amdgpu_executable_vtable,
                               &executable->resource);
  executable->host_allocator = host_allocator;
  executable->libhsa = libhsa;
  executable->device = device;
  executable->kernel_count = export_count;
  // Rebase each table pointer from its byte offset within the allocation.
  uint8_t* executable_storage = (uint8_t*)executable;
  executable->export_infos =
      (iree_hal_executable_export_info_t*)(executable_storage +
                                           export_infos_offset);
  executable->export_parameter_offsets =
      (iree_host_size_t*)(executable_storage + export_parameter_offsets_offset);
  executable->export_parameters =
      export_parameter_count
          ? (iree_hal_executable_export_parameter_t*)(executable_storage +
                                                      export_parameters_offset)
          : NULL;
  executable->host_kernel_args =
      (iree_hal_amdgpu_device_kernel_args_t*)(executable_storage +
                                              host_kernel_args_offset);
  executable->host_dispatch_descriptors =
      (iree_hal_amdgpu_executable_dispatch_descriptor_t*)(executable_storage +
                                                          host_dispatch_descriptors_offset);
  executable->device_agents =
      (hsa_agent_t*)(executable_storage + device_agents_offset);
  memcpy(executable->device_agents, topology->gpu_agents,
         topology->gpu_agent_count * sizeof(executable->device_agents[0]));
  executable->queue_affinity = physical_devices->queue_affinity;
  executable->queue_affinity_domain = (iree_hal_amdgpu_queue_affinity_domain_t){
      .supported_affinity = physical_devices->queue_affinity,
      .physical_device_count = topology->gpu_agent_count,
      .queue_count_per_physical_device = topology->gpu_agent_queue_count,
  };
  executable->loaded_physical_device_mask =
      physical_devices->physical_device_mask;
  executable->loaded_physical_device_count =
      physical_devices->physical_device_count;
  executable->device_count = topology->gpu_agent_count;
  // Hand back the interior name-storage regions for the caller to populate.
  *out_export_name_storage =
      (char*)executable_storage + export_name_storage_offset;
  *out_export_parameter_name_storage =
      (char*)executable_storage + export_parameter_name_storage_offset;
  *out_executable = executable;
  return iree_ok_status();
}
// Clears the per-agent kernel object handles in the host-resident table;
// those handles are only meaningful on the device they were resolved against.
static void iree_hal_amdgpu_executable_invalidate_host_kernel_objects(
    iree_hal_amdgpu_executable_t* executable) {
  if (!executable) return;
  iree_hal_amdgpu_device_kernel_args_t* kernel_args =
      executable->host_kernel_args;
  for (iree_host_size_t i = 0; i < executable->kernel_count; ++i) {
    kernel_args[i].kernel_object = 0;
  }
}
// Initializes the dispatch descriptor row for |device_ordinal| from the
// host-resident kernel args table. Descriptors are stored row-major as
// [device][kernel].
static iree_status_t
iree_hal_amdgpu_executable_initialize_dispatch_descriptors_for_device(
    iree_hal_amdgpu_executable_t* executable, iree_host_size_t device_ordinal) {
  const iree_host_size_t row_base = device_ordinal * executable->kernel_count;
  for (iree_host_size_t i = 0; i < executable->kernel_count; ++i) {
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_initialize_dispatch_descriptor(
            &executable->host_kernel_args[i],
            &executable->host_dispatch_descriptors[row_base + i]),
        "initializing dispatch descriptor for device %" PRIhsz
        " export %" PRIhsz,
        device_ordinal, i);
  }
  return iree_ok_status();
}
// Registers the executable's code-object bytes plus per-device loader load
// placements with |profile_metadata| so later trace captures can be
// self-contained.
//
// Allocates a temporary load-info array sized for the loaded device count,
// populates one entry per device in |loaded_physical_device_mask| (stopping
// early on failure), and frees the array on all paths.
static iree_status_t iree_hal_amdgpu_executable_register_profile_artifacts(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
    iree_const_byte_span_t code_object_data,
    iree_hal_amdgpu_executable_t* executable) {
  IREE_TRACE_ZONE_BEGIN(z0);
  iree_host_size_t load_info_storage_size = 0;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, IREE_STRUCT_LAYOUT(
              0, &load_info_storage_size,
              IREE_STRUCT_FIELD(executable->loaded_physical_device_count,
                                iree_hal_amdgpu_profile_code_object_load_info_t,
                                NULL)));
  iree_hal_amdgpu_profile_code_object_load_info_t* load_infos = NULL;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_allocator_malloc(executable->host_allocator,
                                load_info_storage_size, (void**)&load_infos));
  // Populate one entry per loaded device; the loop exits early once any
  // query fails so |load_info_ordinal| always counts valid entries.
  iree_status_t status = iree_ok_status();
  iree_host_size_t load_info_ordinal = 0;
  for (iree_host_size_t device_ordinal = 0;
       device_ordinal < topology->gpu_agent_count && iree_status_is_ok(status);
       ++device_ordinal) {
    if (!iree_hal_amdgpu_physical_device_mask_contains(
            executable->loaded_physical_device_mask, device_ordinal)) {
      continue;
    }
    if (IREE_UNLIKELY(device_ordinal > UINT32_MAX)) {
      status = iree_make_status(
          IREE_STATUS_OUT_OF_RANGE,
          "profile executable physical device ordinal exceeds uint32_t");
    } else {
      status =
          iree_hal_amdgpu_executable_populate_profile_code_object_load_info(
              libhsa, executable->handle, (uint32_t)device_ordinal,
              topology->gpu_agents[device_ordinal],
              &load_infos[load_info_ordinal]);
      if (iree_status_is_ok(status)) ++load_info_ordinal;
    }
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_profile_metadata_register_executable_artifacts(
        profile_metadata, executable->profile_id, code_object_data,
        executable->profile_code_object_hash,
        executable->loaded_physical_device_count, load_infos);
  }
  // The registry copies what it needs; the scratch array is always freed.
  iree_allocator_free(executable->host_allocator, load_infos);
  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Registers the executable's reflection tables with the profile registry and
// then captures its code-object artifacts.
static iree_status_t iree_hal_amdgpu_executable_register_profile_metadata(
    const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
    iree_const_byte_span_t code_object_data,
    iree_hal_amdgpu_executable_t* executable) {
  // Register reflection first to assign the executable's profile id.
  iree_status_t status = iree_hal_amdgpu_profile_metadata_register_executable(
      profile_metadata, executable->kernel_count, executable->export_infos,
      executable->export_parameter_offsets,
      executable->profile_code_object_hash, executable->host_kernel_args,
      &executable->profile_id);
  if (!iree_status_is_ok(status)) return status;
  // Executable trace profiling may begin after executable preparation.
  // Preserve the exact code-object bytes and loader load ranges while
  // |code_object_data| is still in scope so a later ATT capture can always
  // emit a self-contained profile.
  return iree_hal_amdgpu_executable_register_profile_artifacts(
      libhsa, topology, profile_metadata, code_object_data, executable);
}
// Checks that the HSACO metadata for an export does not require more visible
// constants/bindings than the flatbuffer ExecutableDef declares.
static iree_status_t
iree_hal_amdgpu_executable_validate_export_parameter_requirements(
    iree_hal_amdgpu_ExportDef_table_t export_def,
    iree_string_view_t symbol_name,
    const iree_hal_amdgpu_hsaco_metadata_export_parameter_requirements_t*
        requirements) {
  // The flatbuffer owns the HAL ABI counts for wrapped executables. HSACO
  // metadata may drop arguments that LLVM optimized away, but it must never
  // need more visible arguments than the flatbuffer layout can supply.
  const uint32_t declared_constant_count =
      iree_hal_amdgpu_ExportDef_constant_count_get(export_def);
  if (requirements->constant_count > declared_constant_count) {
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "HSACO metadata for export `%.*s` declares %u reflected constants but "
        "ExecutableDef only declares %u",
        (int)symbol_name.size, symbol_name.data,
        (uint32_t)requirements->constant_count, declared_constant_count);
  }
  const iree_host_size_t declared_binding_count =
      iree_hal_amdgpu_BindingBits_vec_len(
          iree_hal_amdgpu_ExportDef_binding_flags_get(export_def));
  if (requirements->binding_count > declared_binding_count) {
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "HSACO metadata for export `%.*s` declares %u reflected bindings but "
        "ExecutableDef only declares %" PRIhsz,
        (int)symbol_name.size, symbol_name.data,
        (uint32_t)requirements->binding_count, declared_binding_count);
  }
  return iree_ok_status();
}
// Sizes the reflection storage needed for flatbuffer-wrapped executables:
// total export name bytes, total parameter record count, and total parameter
// name bytes. While walking, validates that each export's HSACO metadata is
// compatible with the flatbuffer ABI counts. All sums use checked arithmetic.
static iree_status_t iree_hal_amdgpu_executable_calculate_reflection_storage(
    iree_hal_amdgpu_ExportDef_vec_t export_defs,
    const iree_hal_amdgpu_hsaco_metadata_t* hsaco_metadata,
    iree_host_size_t* out_export_name_storage_size,
    iree_host_size_t* out_export_parameter_count,
    iree_host_size_t* out_export_parameter_name_storage_size) {
  iree_host_size_t export_name_storage_size = 0;
  iree_host_size_t export_parameter_count = 0;
  iree_host_size_t export_parameter_name_storage_size = 0;
  const iree_host_size_t export_count =
      iree_hal_amdgpu_ExportDef_vec_len(export_defs);
  for (iree_host_size_t i = 0; i < export_count; ++i) {
    iree_hal_amdgpu_ExportDef_table_t export_def =
        iree_hal_amdgpu_ExportDef_vec_at(export_defs, i);
    // Accumulate the display-name bytes for this export.
    iree_string_view_t name =
        iree_hal_amdgpu_executable_export_reflection_name(export_def);
    if (!iree_host_size_checked_add(export_name_storage_size, name.size,
                                    &export_name_storage_size)) {
      return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                              "export name storage size overflow");
    }
    // Look up the matching HSACO kernel metadata by symbol name.
    iree_string_view_t symbol_name =
        iree_hal_amdgpu_executable_flatbuffer_string_view(
            iree_hal_amdgpu_ExportDef_symbol_name_get(export_def));
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel = NULL;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_hsaco_metadata_find_kernel_by_symbol(
                             hsaco_metadata, symbol_name, &kernel),
                         "looking up HSACO metadata for export `%.*s`",
                         (int)symbol_name.size, symbol_name.data);
    // Project and validate the per-export parameter requirements.
    iree_hal_amdgpu_hsaco_metadata_export_parameter_requirements_t requirements;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_calculate_default_export_parameter_requirements(
            kernel, &requirements),
        "projecting HSACO parameters for export `%.*s`", (int)symbol_name.size,
        symbol_name.data);
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_validate_export_parameter_requirements(
            export_def, symbol_name, &requirements));
    if (!iree_host_size_checked_add(export_parameter_count,
                                    requirements.parameter_count,
                                    &export_parameter_count) ||
        !iree_host_size_checked_add(export_parameter_name_storage_size,
                                    requirements.name_storage_size,
                                    &export_parameter_name_storage_size)) {
      return iree_make_status(
          IREE_STATUS_OUT_OF_RANGE,
          "export parameter reflection storage size overflow");
    }
  }
  *out_export_name_storage_size = export_name_storage_size;
  *out_export_parameter_count = export_parameter_count;
  *out_export_parameter_name_storage_size = export_parameter_name_storage_size;
  return iree_ok_status();
}
// Validates a raw HSACO kernel's required workgroup size (when declared via
// `.reqd_workgroup_size`) against per-dimension and total device limits.
// Kernels without a required workgroup size are accepted unconditionally.
static iree_status_t iree_hal_amdgpu_executable_verify_raw_hsaco_kernel(
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel,
    const iree_hal_amdgpu_device_limits_t* limits) {
  if (!kernel->has_required_workgroup_size) {
    return iree_ok_status();
  }
  const uint32_t* workgroup_size = kernel->required_workgroup_size;
  // Each dimension must fit within the device's per-dimension maximum.
  if (workgroup_size[0] > limits->max_workgroup_size_per_dim[0] ||
      workgroup_size[1] > limits->max_workgroup_size_per_dim[1] ||
      workgroup_size[2] > limits->max_workgroup_size_per_dim[2]) {
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "raw HSACO kernel `%.*s` workgroup size dims %ux%ux%u exceed device "
        "maximum %ux%ux%u",
        (int)kernel->symbol_name.size, kernel->symbol_name.data,
        workgroup_size[0], workgroup_size[1], workgroup_size[2],
        limits->max_workgroup_size_per_dim[0],
        limits->max_workgroup_size_per_dim[1],
        limits->max_workgroup_size_per_dim[2]);
  }
  // The product must also fit the device's total workgroup size limit; the
  // uint64 cast avoids overflow of the three uint32 factors.
  const uint64_t total_workgroup_size =
      (uint64_t)workgroup_size[0] * workgroup_size[1] * workgroup_size[2];
  if (total_workgroup_size > limits->max_workgroup_size) {
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "raw HSACO kernel `%.*s` workgroup size total %" PRIu64
        " exceeds device "
        "maximum %u",
        (int)kernel->symbol_name.size, kernel->symbol_name.data,
        total_workgroup_size, limits->max_workgroup_size);
  }
  return iree_ok_status();
}
// Produces the effective workgroup size for a raw HSACO kernel. Raw HSACO
// without `.reqd_workgroup_size` is represented as a dynamic workgroup-size
// export with 1x1x1 minimum granularity; the actual launch geometry must
// come from the dispatch config.
static void iree_hal_amdgpu_executable_raw_hsaco_workgroup_size(
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel,
    uint32_t out_workgroup_size[3]) {
  const bool has_required = kernel->has_required_workgroup_size;
  for (int dim = 0; dim < 3; ++dim) {
    out_workgroup_size[dim] =
        has_required ? kernel->required_workgroup_size[dim] : 1u;
  }
}
// Sizes reflection storage for direct (raw HSACO) loading: total export name
// bytes, total parameter record count, and total parameter name bytes, while
// validating each kernel against |limits|. Requires the code object to
// declare `amdhsa.target` so the target ISA is known. All sums use checked
// arithmetic.
static iree_status_t
iree_hal_amdgpu_executable_calculate_raw_hsaco_reflection_storage(
    const iree_hal_amdgpu_hsaco_metadata_t* hsaco_metadata,
    const iree_hal_amdgpu_device_limits_t* limits,
    iree_host_size_t* out_export_name_storage_size,
    iree_host_size_t* out_export_parameter_count,
    iree_host_size_t* out_export_parameter_name_storage_size) {
  if (iree_string_view_is_empty(hsaco_metadata->target)) {
    return iree_make_status(
        IREE_STATUS_INVALID_ARGUMENT,
        "raw HSACO metadata is missing `amdhsa.target`; direct loading "
        "requires the code object to declare its target ISA");
  }
  iree_host_size_t export_name_storage_size = 0;
  iree_host_size_t export_parameter_count = 0;
  iree_host_size_t export_parameter_name_storage_size = 0;
  for (iree_host_size_t i = 0; i < hsaco_metadata->kernel_count; ++i) {
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel =
        &hsaco_metadata->kernels[i];
    // Reject kernels whose declared workgroup sizes exceed device limits.
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_verify_raw_hsaco_kernel(kernel, limits));
    if (!iree_host_size_checked_add(export_name_storage_size,
                                    kernel->reflection_name.size,
                                    &export_name_storage_size)) {
      return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                              "export name storage size overflow");
    }
    // Project the per-kernel parameter reflection requirements.
    iree_hal_amdgpu_hsaco_metadata_export_parameter_requirements_t requirements;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_calculate_default_export_parameter_requirements(
            kernel, &requirements),
        "projecting HSACO parameters for raw kernel `%.*s`",
        (int)kernel->symbol_name.size, kernel->symbol_name.data);
    if (!iree_host_size_checked_add(export_parameter_count,
                                    requirements.parameter_count,
                                    &export_parameter_count) ||
        !iree_host_size_checked_add(export_parameter_name_storage_size,
                                    requirements.name_storage_size,
                                    &export_parameter_name_storage_size)) {
      return iree_make_status(
          IREE_STATUS_OUT_OF_RANGE,
          "export parameter reflection storage size overflow");
    }
  }
  *out_export_name_storage_size = export_name_storage_size;
  *out_export_parameter_count = export_parameter_count;
  *out_export_parameter_name_storage_size = export_parameter_name_storage_size;
  return iree_ok_status();
}
// Populates per-export reflection tables for a flatbuffer-wrapped executable.
// For each ExportDef the matching HSACO kernel metadata is found by symbol
// name, default parameter requirements are projected and validated against the
// flatbuffer ABI, and the export name / parameter storage is filled in-place.
//
// |export_parameter_offsets| must have room for export_count + 1 entries; the
// trailing entry receives the total parameter count so callers can compute
// per-export spans. |export_name_storage| and |export_parameter_name_storage|
// must be sized as computed by the matching calculate-reflection-storage pass;
// the pointers are advanced as each export consumes its portion.
static iree_status_t iree_hal_amdgpu_executable_initialize_export_infos(
    iree_hal_amdgpu_ExportDef_vec_t export_defs,
    const iree_hal_amdgpu_hsaco_metadata_t* hsaco_metadata,
    iree_hal_executable_export_info_t* export_infos,
    iree_host_size_t* export_parameter_offsets,
    iree_hal_executable_export_parameter_t* export_parameters,
    char* export_name_storage, char* export_parameter_name_storage) {
  iree_host_size_t export_parameter_offset = 0;
  const iree_host_size_t export_count =
      iree_hal_amdgpu_ExportDef_vec_len(export_defs);
  for (iree_host_size_t i = 0; i < export_count; ++i) {
    iree_hal_amdgpu_ExportDef_table_t export_def =
        iree_hal_amdgpu_ExportDef_vec_at(export_defs, i);
    iree_hal_executable_export_info_t* info = &export_infos[i];
    // Record where this export's parameters begin in the shared table.
    export_parameter_offsets[i] = export_parameter_offset;
    iree_string_view_t symbol_name =
        iree_hal_amdgpu_executable_flatbuffer_string_view(
            iree_hal_amdgpu_ExportDef_symbol_name_get(export_def));
    // Locate the HSACO metadata entry matching the flatbuffer export symbol.
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel = NULL;
    IREE_RETURN_IF_ERROR(iree_hal_amdgpu_hsaco_metadata_find_kernel_by_symbol(
                             hsaco_metadata, symbol_name, &kernel),
                         "looking up HSACO metadata for export `%.*s`",
                         (int)symbol_name.size, symbol_name.data);
    iree_hal_amdgpu_hsaco_metadata_export_parameter_requirements_t requirements;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_calculate_default_export_parameter_requirements(
            kernel, &requirements),
        "projecting HSACO parameters for export `%.*s`", (int)symbol_name.size,
        symbol_name.data);
    // Ensure the projected requirements agree with the flatbuffer-declared ABI
    // before committing anything to the output tables.
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_validate_export_parameter_requirements(
            export_def, symbol_name, &requirements));
    // Copy the (possibly empty) reflection name into the shared name storage;
    // info->name views that storage, not the flatbuffer.
    iree_string_view_t name =
        iree_hal_amdgpu_executable_export_reflection_name(export_def);
    if (!iree_string_view_is_empty(name)) {
      memcpy(export_name_storage, name.data, name.size);
    }
    memset(info, 0, sizeof(*info));
    info->name = iree_make_string_view(export_name_storage, name.size);
    info->flags = IREE_HAL_EXECUTABLE_EXPORT_FLAG_NONE;
    // Preserve the flatbuffer ABI counts even when the HSACO metadata has lost
    // optimized-unused arguments.
    info->constant_count =
        (uint16_t)iree_hal_amdgpu_ExportDef_constant_count_get(export_def);
    iree_hal_amdgpu_BindingBits_vec_t binding_flags =
        iree_hal_amdgpu_ExportDef_binding_flags_get(export_def);
    info->binding_count =
        (uint16_t)iree_hal_amdgpu_BindingBits_vec_len(binding_flags);
    info->parameter_count = requirements.parameter_count;
    const iree_hal_amdgpu_Dims_struct_t workgroup_size =
        iree_hal_amdgpu_ExportDef_workgroup_size_get(export_def);
    info->workgroup_size[0] = workgroup_size->x;
    info->workgroup_size[1] = workgroup_size->y;
    info->workgroup_size[2] = workgroup_size->z;
    // NULL bases when an export has no parameters/names so the populate call
    // can assert on misuse instead of writing through stale pointers.
    iree_hal_executable_export_parameter_t* export_parameter_base =
        requirements.parameter_count
            ? &export_parameters[export_parameter_offset]
            : NULL;
    char* export_parameter_name_base =
        requirements.name_storage_size ? export_parameter_name_storage : NULL;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_populate_default_export_parameters(
            kernel, requirements.parameter_count, export_parameter_base,
            requirements.name_storage_size, export_parameter_name_base),
        "populating reflected parameters for export `%.*s`",
        (int)symbol_name.size, symbol_name.data);
    // Advance the shared storage cursors past this export's consumption.
    export_name_storage += name.size;
    export_parameter_offset += requirements.parameter_count;
    export_parameter_name_storage += requirements.name_storage_size;
  }
  // Trailing sentinel offset bounds the final export's parameter span.
  export_parameter_offsets[export_count] = export_parameter_offset;
  return iree_ok_status();
}
// Populates per-export reflection tables for a raw (non-flatbuffer) HSACO
// executable. Each kernel in |hsaco_metadata| becomes one export, with counts
// and parameter reflection projected entirely from the HSACO metadata since
// there is no flatbuffer ABI to preserve.
//
// |export_parameter_offsets| must have room for kernel_count + 1 entries; the
// trailing entry receives the total parameter count. Name and parameter-name
// storage must be sized per
// iree_hal_amdgpu_executable_calculate_raw_hsaco_reflection_storage.
static iree_status_t
iree_hal_amdgpu_executable_initialize_raw_hsaco_export_infos(
    const iree_hal_amdgpu_hsaco_metadata_t* hsaco_metadata,
    iree_hal_executable_export_info_t* export_infos,
    iree_host_size_t* export_parameter_offsets,
    iree_hal_executable_export_parameter_t* export_parameters,
    char* export_name_storage, char* export_parameter_name_storage) {
  iree_host_size_t export_parameter_offset = 0;
  for (iree_host_size_t i = 0; i < hsaco_metadata->kernel_count; ++i) {
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel =
        &hsaco_metadata->kernels[i];
    iree_hal_executable_export_info_t* info = &export_infos[i];
    // Record where this export's parameters begin in the shared table.
    export_parameter_offsets[i] = export_parameter_offset;
    iree_hal_amdgpu_hsaco_metadata_export_parameter_requirements_t requirements;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_calculate_default_export_parameter_requirements(
            kernel, &requirements),
        "projecting HSACO parameters for raw kernel `%.*s`",
        (int)kernel->symbol_name.size, kernel->symbol_name.data);
    // Copy the reflection name into shared storage; info->name views that
    // storage so it remains valid after the metadata is deinitialized.
    iree_string_view_t name = kernel->reflection_name;
    if (!iree_string_view_is_empty(name)) {
      memcpy(export_name_storage, name.data, name.size);
    }
    memset(info, 0, sizeof(*info));
    info->name = iree_make_string_view(export_name_storage, name.size);
    // Kernels without a required workgroup size are dispatched with a
    // caller-chosen size and flagged as dynamic.
    info->flags = kernel->has_required_workgroup_size
                      ? IREE_HAL_EXECUTABLE_EXPORT_FLAG_NONE
                      : IREE_HAL_EXECUTABLE_EXPORT_FLAG_WORKGROUP_SIZE_DYNAMIC;
    info->constant_count = requirements.constant_count;
    info->binding_count = requirements.binding_count;
    info->parameter_count = requirements.parameter_count;
    iree_hal_amdgpu_executable_raw_hsaco_workgroup_size(kernel,
                                                       info->workgroup_size);
    // NULL bases when an export has no parameters/names so the populate call
    // can assert on misuse instead of writing through stale pointers.
    iree_hal_executable_export_parameter_t* export_parameter_base =
        requirements.parameter_count
            ? &export_parameters[export_parameter_offset]
            : NULL;
    char* export_parameter_name_base =
        requirements.name_storage_size ? export_parameter_name_storage : NULL;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_populate_default_export_parameters(
            kernel, requirements.parameter_count, export_parameter_base,
            requirements.name_storage_size, export_parameter_name_base),
        "populating reflected parameters for raw kernel `%.*s`",
        (int)kernel->symbol_name.size, kernel->symbol_name.data);
    // Advance the shared storage cursors past this export's consumption.
    export_name_storage += name.size;
    export_parameter_offset += requirements.parameter_count;
    export_parameter_name_storage += requirements.name_storage_size;
  }
  // Trailing sentinel offset bounds the final export's parameter span.
  export_parameter_offsets[hsaco_metadata->kernel_count] =
      export_parameter_offset;
  return iree_ok_status();
}
// Resolves host-side kernel dispatch arguments for every export declared in
// |export_defs| by looking up the corresponding HSA symbol in the loaded
// |executable|. |host_kernel_args| must have one entry per export.
//
// The workgroup size and constant/binding counts come from the flatbuffer ABI;
// the remaining fields (segment sizes, kernel object, etc.) are queried from
// the HSA symbol on |any_device_agent|. Per-device differences (the kernel
// object pointer) are handled later during device table upload.
static iree_status_t iree_hal_amdgpu_executable_resolve_flatbuffer_kernel_args(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    iree_hal_amdgpu_ExportDef_vec_t export_defs, hsa_agent_t any_device_agent,
    iree_hal_amdgpu_device_kernel_args_t* host_kernel_args) {
  const iree_host_size_t kernel_count =
      iree_hal_amdgpu_ExportDef_vec_len(export_defs);
  for (iree_host_size_t kernel_ordinal = 0; kernel_ordinal < kernel_count;
       ++kernel_ordinal) {
    iree_hal_amdgpu_ExportDef_table_t export_def =
        iree_hal_amdgpu_ExportDef_vec_at(export_defs, kernel_ordinal);
    flatbuffers_string_t symbol_name =
        iree_hal_amdgpu_ExportDef_symbol_name_get(export_def);
    iree_string_view_t symbol_name_view =
        iree_hal_amdgpu_executable_flatbuffer_string_view(symbol_name);
    if (iree_string_view_is_empty(symbol_name_view)) {
      return iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                              "executable kernel symbol name is empty");
    }
    // Flatbuffer strings are NUL-terminated so we can pass them directly to
    // the cstring-based HSA lookup.
    hsa_executable_symbol_t symbol = {0};
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_get_symbol_by_cstring(
            libhsa, executable, symbol_name, any_device_agent, &symbol),
        "looking up HSA symbol for export `%.*s`", (int)symbol_name_view.size,
        symbol_name_view.data);
    const iree_hal_amdgpu_Dims_struct_t flatbuffer_workgroup_size =
        iree_hal_amdgpu_ExportDef_workgroup_size_get(export_def);
    const uint32_t workgroup_size[3] = {
        flatbuffer_workgroup_size->x,
        flatbuffer_workgroup_size->y,
        flatbuffer_workgroup_size->z,
    };
    const uint16_t constant_count =
        (uint16_t)iree_hal_amdgpu_ExportDef_constant_count_get(export_def);
    iree_hal_amdgpu_BindingBits_vec_t binding_bits =
        iree_hal_amdgpu_ExportDef_binding_flags_get(export_def);
    const uint16_t binding_count =
        (uint16_t)iree_hal_amdgpu_BindingBits_vec_len(binding_bits);
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_resolve_kernel_args_from_symbol(
            libhsa, symbol, workgroup_size, constant_count, binding_count,
            &host_kernel_args[kernel_ordinal]),
        "resolving kernel args for `%.*s`", (int)symbol_name_view.size,
        symbol_name_view.data);
  }
  return iree_ok_status();
}
// Resolves host-side kernel dispatch arguments for every kernel in a raw
// HSACO executable. |host_kernel_args| must have one entry per kernel in
// |hsaco_metadata|.
//
// Unlike the flatbuffer path there is no declared ABI: constant/binding counts
// and workgroup sizes are projected from the HSACO metadata itself. Per-device
// differences (the kernel object pointer) are handled later during device
// table upload.
static iree_status_t iree_hal_amdgpu_executable_resolve_raw_hsaco_kernel_args(
    const iree_hal_amdgpu_libhsa_t* libhsa, hsa_executable_t executable,
    const iree_hal_amdgpu_hsaco_metadata_t* hsaco_metadata,
    hsa_agent_t any_device_agent,
    iree_hal_amdgpu_device_kernel_args_t* host_kernel_args) {
  for (iree_host_size_t kernel_ordinal = 0;
       kernel_ordinal < hsaco_metadata->kernel_count; ++kernel_ordinal) {
    const iree_hal_amdgpu_hsaco_metadata_kernel_t* kernel =
        &hsaco_metadata->kernels[kernel_ordinal];
    iree_string_view_t symbol_name = kernel->symbol_name;
    hsa_executable_symbol_t symbol = {0};
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_get_raw_hsaco_symbol_by_name(
            libhsa, executable, symbol_name, any_device_agent, &symbol),
        "looking up HSA symbol for raw kernel `%.*s`", (int)symbol_name.size,
        symbol_name.data);
    iree_hal_amdgpu_hsaco_metadata_export_parameter_requirements_t requirements;
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_hsaco_metadata_calculate_default_export_parameter_requirements(
            kernel, &requirements),
        "projecting HSACO parameters for raw kernel `%.*s`",
        (int)symbol_name.size, symbol_name.data);
    uint32_t workgroup_size[3] = {0};
    iree_hal_amdgpu_executable_raw_hsaco_workgroup_size(kernel, workgroup_size);
    IREE_RETURN_IF_ERROR(
        iree_hal_amdgpu_executable_resolve_kernel_args_from_symbol(
            libhsa, symbol, workgroup_size, requirements.constant_count,
            requirements.binding_count, &host_kernel_args[kernel_ordinal]),
        "resolving kernel args for raw kernel `%.*s`", (int)symbol_name.size,
        symbol_name.data);
  }
  return iree_ok_status();
}
// Creates an executable from a flatbuffer-wrapped code object. The flatbuffer
// header is verified, the single embedded module image extracted, HSACO
// metadata parsed from its ELF, reflection storage sized and populated, the
// module loaded and frozen on the selected physical devices, and per-device
// kernel argument tables uploaded.
//
// Uses status-chaining (each step runs only if the prior steps succeeded) so
// there is a single cleanup path at the bottom: on failure the partially
// constructed executable is destroyed and a NULL |out_executable| returned.
static iree_status_t iree_hal_amdgpu_executable_create_from_flatbuffer(
    iree_hal_device_t* device, const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    const iree_hal_amdgpu_queue_affinity_physical_device_set_t*
        physical_devices,
    const iree_hal_executable_params_t* executable_params,
    const iree_hal_amdgpu_device_limits_t* limits, hsa_agent_t any_device_agent,
    iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
  *out_executable = NULL;
  // Verify the executable header/identifier and the flatbuffer contents
  // against the device limits before trusting any of it.
  iree_const_byte_span_t executable_flatbuffer = iree_const_byte_span_empty();
  iree_hal_amdgpu_ExecutableDef_table_t executable_def = 0;
  iree_hal_amdgpu_ExportDef_vec_t export_defs = 0;
  iree_const_byte_span_t code_object_data = iree_const_byte_span_empty();
  iree_host_size_t export_count = 0;
  iree_status_t status = iree_hal_read_executable_flatbuffer_header(
      executable_params->executable_data, /*unsafe_infer_size=*/false,
      iree_hal_amdgpu_ExecutableDef_file_identifier, &executable_flatbuffer);
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_flatbuffer_verify(executable_flatbuffer,
                                                          limits);
  }
  // Only a single module image is supported per executable today.
  if (iree_status_is_ok(status)) {
    executable_def =
        iree_hal_amdgpu_ExecutableDef_as_root(executable_flatbuffer.data);
    export_defs = iree_hal_amdgpu_ExecutableDef_exports_get(executable_def);
    export_count = iree_hal_amdgpu_ExportDef_vec_len(export_defs);
    iree_hal_amdgpu_ModuleDef_vec_t module_defs =
        iree_hal_amdgpu_ExecutableDef_modules_get(executable_def);
    status = iree_hal_amdgpu_executable_get_single_module_image(
        module_defs, &code_object_data);
  }
  // Parse the embedded HSACO ELF metadata; must be deinitialized on all paths
  // below (see the unconditional deinitialize before return).
  iree_hal_amdgpu_hsaco_metadata_t hsaco_metadata = {0};
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_hsaco_metadata_initialize_from_elf(
        code_object_data, host_allocator, &hsaco_metadata);
  }
  // Size the reflection storage so a single executable allocation can hold
  // export names and parameter tables.
  iree_host_size_t export_name_storage_size = 0;
  iree_host_size_t export_parameter_count = 0;
  iree_host_size_t export_parameter_name_storage_size = 0;
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_calculate_reflection_storage(
        export_defs, &hsaco_metadata, &export_name_storage_size,
        &export_parameter_count, &export_parameter_name_storage_size);
  }
  iree_hal_amdgpu_executable_t* executable = NULL;
  char* export_name_storage = NULL;
  char* export_parameter_name_storage = NULL;
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_allocate(
        device, libhsa, topology, physical_devices, export_count,
        export_name_storage_size, export_parameter_count,
        export_parameter_name_storage_size, host_allocator,
        &export_name_storage, &export_parameter_name_storage, &executable);
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_initialize_export_infos(
        export_defs, &hsaco_metadata, executable->export_infos,
        executable->export_parameter_offsets, executable->export_parameters,
        export_name_storage, export_parameter_name_storage);
  }
  if (iree_status_is_ok(status)) {
    iree_hal_amdgpu_profile_metadata_hash_code_object(
        code_object_data, executable->profile_code_object_hash);
  }
  // Publish any embedded source files to the tracing infrastructure.
  if (iree_status_is_ok(status)) {
    iree_hal_debug_publish_source_files(
        iree_hal_amdgpu_ExecutableDef_source_files_get(executable_def));
  }
  // Load executable and register it with selected GPU agents.
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_load_module(
        libhsa, topology, physical_devices, executable_params, code_object_data,
        &executable->handle);
  }
  // Resolve kernel args for each export.
  // These parameters should be the same for all devices as we require all
  // devices have the same ISA. The only thing that will differ is the
  // kernel_object pointer and we handle that per-device during table upload.
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_resolve_flatbuffer_kernel_args(
        libhsa, executable->handle, export_defs, any_device_agent,
        executable->host_kernel_args);
  }
  // Upload copies of kernel arguments for each device.
  // We reuse the host storage we already allocated to make it possible to
  // memcpy the entire table in one go from host memory.
  for (iree_host_size_t device_ordinal = 0;
       iree_status_is_ok(status) && device_ordinal < executable->device_count;
       ++device_ordinal) {
    // Skip devices not selected by the executable queue affinity.
    if (!iree_hal_amdgpu_physical_device_mask_contains(
            executable->loaded_physical_device_mask, device_ordinal)) {
      continue;
    }
    status = iree_hal_amdgpu_executable_upload_flatbuffer_kernel_table(
        libhsa, executable->handle, export_defs, executable->host_kernel_args,
        topology->gpu_agents[device_ordinal],
        &executable->device_kernel_args[device_ordinal]);
    if (iree_status_is_ok(status)) {
      status =
          iree_hal_amdgpu_executable_initialize_dispatch_descriptors_for_device(
              executable, device_ordinal);
    }
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_register_profile_metadata(
        libhsa, topology, profile_metadata, code_object_data, executable);
  }
  // Invalidate the kernel object pointer in all host args so that we don't
  // accidentally use it instead of the device-specific one.
  if (iree_status_is_ok(status)) {
    iree_hal_amdgpu_executable_invalidate_host_kernel_objects(executable);
  }
  // The metadata borrows into code_object_data and is no longer needed once
  // all reflection/kernel tables are built.
  iree_hal_amdgpu_hsaco_metadata_deinitialize(&hsaco_metadata);
  if (iree_status_is_ok(status)) {
    *out_executable = (iree_hal_executable_t*)executable;
  } else if (executable) {
    iree_hal_executable_destroy((iree_hal_executable_t*)executable);
  }
  return status;
}
// Creates an executable directly from a raw HSACO code object (no flatbuffer
// wrapper). All reflection information is projected from the HSACO ELF
// metadata; each kernel in the metadata becomes one export.
//
// Mirrors the flatbuffer path: status-chained construction with a single
// cleanup point at the bottom that destroys any partially constructed
// executable on failure.
static iree_status_t iree_hal_amdgpu_executable_create_from_raw_hsaco(
    iree_hal_device_t* device, const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    const iree_hal_amdgpu_queue_affinity_physical_device_set_t*
        physical_devices,
    const iree_hal_executable_params_t* executable_params,
    const iree_hal_amdgpu_device_limits_t* limits, hsa_agent_t any_device_agent,
    iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
  *out_executable = NULL;
  // The entire provided blob is the ELF code object.
  iree_const_byte_span_t code_object_data = executable_params->executable_data;
  // Must be deinitialized on all paths below (see the unconditional
  // deinitialize before return).
  iree_hal_amdgpu_hsaco_metadata_t hsaco_metadata = {0};
  iree_status_t status = iree_hal_amdgpu_hsaco_metadata_initialize_from_elf(
      code_object_data, host_allocator, &hsaco_metadata);
  // Size the reflection storage (also validates each kernel against |limits|).
  iree_host_size_t export_name_storage_size = 0;
  iree_host_size_t export_parameter_count = 0;
  iree_host_size_t export_parameter_name_storage_size = 0;
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_calculate_raw_hsaco_reflection_storage(
        &hsaco_metadata, limits, &export_name_storage_size,
        &export_parameter_count, &export_parameter_name_storage_size);
  }
  iree_hal_amdgpu_executable_t* executable = NULL;
  char* export_name_storage = NULL;
  char* export_parameter_name_storage = NULL;
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_allocate(
        device, libhsa, topology, physical_devices, hsaco_metadata.kernel_count,
        export_name_storage_size, export_parameter_count,
        export_parameter_name_storage_size, host_allocator,
        &export_name_storage, &export_parameter_name_storage, &executable);
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_initialize_raw_hsaco_export_infos(
        &hsaco_metadata, executable->export_infos,
        executable->export_parameter_offsets, executable->export_parameters,
        export_name_storage, export_parameter_name_storage);
  }
  if (iree_status_is_ok(status)) {
    iree_hal_amdgpu_profile_metadata_hash_code_object(
        code_object_data, executable->profile_code_object_hash);
  }
  // Load executable and register it with selected GPU agents.
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_load_module(
        libhsa, topology, physical_devices, executable_params, code_object_data,
        &executable->handle);
  }
  // Resolve kernel args for each export.
  // These parameters should be the same for all devices as we require all
  // devices have the same ISA. The only thing that will differ is the
  // kernel_object pointer and we handle that per-device during table upload.
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_resolve_raw_hsaco_kernel_args(
        libhsa, executable->handle, &hsaco_metadata, any_device_agent,
        executable->host_kernel_args);
  }
  // Upload copies of kernel arguments for each device.
  // We reuse the host storage we already allocated to make it possible to
  // memcpy the entire table in one go from host memory.
  for (iree_host_size_t device_ordinal = 0;
       iree_status_is_ok(status) && device_ordinal < executable->device_count;
       ++device_ordinal) {
    // Skip devices not selected by the executable queue affinity.
    if (!iree_hal_amdgpu_physical_device_mask_contains(
            executable->loaded_physical_device_mask, device_ordinal)) {
      continue;
    }
    status = iree_hal_amdgpu_executable_upload_raw_hsaco_kernel_table(
        libhsa, executable->handle, &hsaco_metadata,
        executable->host_kernel_args, topology->gpu_agents[device_ordinal],
        &executable->device_kernel_args[device_ordinal]);
    if (iree_status_is_ok(status)) {
      status =
          iree_hal_amdgpu_executable_initialize_dispatch_descriptors_for_device(
              executable, device_ordinal);
    }
  }
  if (iree_status_is_ok(status)) {
    status = iree_hal_amdgpu_executable_register_profile_metadata(
        libhsa, topology, profile_metadata, code_object_data, executable);
  }
  // Invalidate the kernel object pointer in all host args so that we don't
  // accidentally use it instead of the device-specific one.
  if (iree_status_is_ok(status)) {
    iree_hal_amdgpu_executable_invalidate_host_kernel_objects(executable);
  }
  // The metadata borrows into code_object_data and is no longer needed once
  // all reflection/kernel tables are built.
  iree_hal_amdgpu_hsaco_metadata_deinitialize(&hsaco_metadata);
  if (iree_status_is_ok(status)) {
    *out_executable = (iree_hal_executable_t*)executable;
  } else if (executable) {
    iree_hal_executable_destroy((iree_hal_executable_t*)executable);
  }
  return status;
}
// Creates an AMDGPU executable from |executable_params| on the devices
// selected by the params' queue affinity. Dispatches to the flatbuffer or raw
// HSACO path based on how the executable data is wrapped.
//
// Returns IREE_STATUS_INCOMPATIBLE if no device in the topology supports the
// declared executable format.
iree_status_t iree_hal_amdgpu_executable_create(
    iree_hal_device_t* device, const iree_hal_amdgpu_libhsa_t* libhsa,
    const iree_hal_amdgpu_topology_t* topology,
    const iree_hal_executable_params_t* executable_params,
    iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
    iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) {
  IREE_ASSERT_ARGUMENT(executable_params);
  IREE_ASSERT_ARGUMENT(profile_metadata);
  IREE_ASSERT_ARGUMENT(out_executable);
  *out_executable = NULL;
  IREE_TRACE_ZONE_BEGIN(z0);
  if (IREE_UNLIKELY(topology->gpu_agent_count == 0)) {
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                             "topology must have at least one GPU device"));
  }
  // Resolve the executable queue affinity to the physical devices that need
  // code-object loads and per-device kernel tables.
  iree_hal_amdgpu_queue_affinity_physical_device_set_t physical_devices;
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_amdgpu_executable_select_physical_devices(
              topology, executable_params->queue_affinity, &physical_devices));
  // Pick a selected device to be our template for device queries. All devices
  // in the topology are expected to be the same. This should have been checked
  // earlier but we do it here in case the user is bypassing that code.
  hsa_agent_t any_device_agent =
      topology->gpu_agents[physical_devices.first_physical_device_ordinal];
  // Check that the executable is supported and get the ISA it matches.
  bool supported = false;
  hsa_isa_t isa = {0};
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_amdgpu_executable_format_supported(
              libhsa, any_device_agent, executable_params->executable_format,
              &supported, &isa));
  if (!supported) {
    IREE_RETURN_AND_END_ZONE_IF_ERROR(
        z0, iree_make_status(IREE_STATUS_INCOMPATIBLE,
                             "executable format `%.*s` not supported by the "
                             "devices in the topology",
                             (int)executable_params->executable_format.size,
                             executable_params->executable_format.data));
  }
  // Query per-device limits used to validate executable metadata below.
  iree_hal_amdgpu_device_limits_t limits = {0};
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_amdgpu_query_device_limits(libhsa, any_device_agent, isa,
                                              &limits));
  iree_status_t status = iree_ok_status();
  if (iree_hal_amdgpu_executable_data_is_wrapped_flatbuffer(
          executable_params->executable_data)) {
    status = iree_hal_amdgpu_executable_create_from_flatbuffer(
        device, libhsa, topology, &physical_devices, executable_params, &limits,
        any_device_agent, profile_metadata, host_allocator, out_executable);
  } else {
    status = iree_hal_amdgpu_executable_create_from_raw_hsaco(
        device, libhsa, topology, &physical_devices, executable_params, &limits,
        any_device_agent, profile_metadata, host_allocator, out_executable);
  }
  // NOTE: both creation paths NULL |out_executable| on failure; the release
  // here is a NULL-tolerant belt-and-suspenders cleanup.
  if (!iree_status_is_ok(status)) {
    iree_hal_executable_release(*out_executable);
    *out_executable = NULL;
  }
  IREE_TRACE_ZONE_END(z0);
  return status;
}
// Returns the profiling identifier assigned to |base_executable|.
uint64_t iree_hal_amdgpu_executable_profile_id(
    iree_hal_executable_t* base_executable) {
  return iree_hal_amdgpu_executable_cast(base_executable)->profile_id;
}
// Destroys the executable: releases per-device kernel argument tables, the
// HSA executable handle, and finally the single host allocation backing the
// executable itself. HSA failures during cleanup assert rather than return
// since there is no caller to propagate to.
static void iree_hal_amdgpu_executable_destroy(
    iree_hal_executable_t* base_executable) {
  iree_hal_amdgpu_executable_t* executable =
      iree_hal_amdgpu_executable_cast(base_executable);
  // Capture the allocator before freeing the struct that stores it.
  iree_allocator_t host_allocator = executable->host_allocator;
  IREE_TRACE_ZONE_BEGIN(z0);
  // Free each device's uploaded kernel argument table (NULL when the device
  // was skipped or upload never happened).
  for (iree_host_size_t device_ordinal = 0;
       device_ordinal < executable->device_count; ++device_ordinal) {
    void* kernel_args = (void*)executable->device_kernel_args[device_ordinal];
    if (kernel_args) {
      iree_hal_amdgpu_hsa_cleanup_assert_success(
          iree_hsa_amd_memory_pool_free_raw(executable->libhsa, kernel_args));
    }
  }
  // Destroy the HSA executable if it was ever successfully created.
  if (executable->handle.handle) {
    iree_hal_amdgpu_hsa_cleanup_assert_success(iree_hsa_executable_destroy_raw(
        executable->libhsa, executable->handle));
  }
  iree_allocator_free(host_allocator, executable);
  IREE_TRACE_ZONE_END(z0);
}
// Returns the host-resident kernel args for |export_ordinal|. Note that the
// kernel object pointer in host args is invalidated after creation; use the
// per-device lookup for dispatchable args.
iree_status_t iree_hal_amdgpu_executable_lookup_kernel_args_for_host(
    iree_hal_executable_t* base_executable,
    iree_hal_executable_export_ordinal_t export_ordinal,
    const iree_hal_amdgpu_device_kernel_args_t** out_kernel_args) {
  *out_kernel_args = NULL;
  const iree_hal_amdgpu_executable_t* self =
      iree_hal_amdgpu_executable_const_cast(base_executable);
  if (IREE_UNLIKELY(export_ordinal >= self->kernel_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "export ordinal %" PRIu32
                            " out of range; executable has %" PRIhsz " exports",
                            export_ordinal, self->kernel_count);
  }
  *out_kernel_args = &self->host_kernel_args[export_ordinal];
  return iree_ok_status();
}
// Returns the device-resident kernel args for |export_ordinal| on the given
// physical device. Fails when the ordinal is out of range, the device ordinal
// exceeds the topology, or the device was not selected by the executable's
// queue affinity (and so has no uploaded table).
iree_status_t iree_hal_amdgpu_executable_lookup_kernel_args_for_device(
    iree_hal_executable_t* base_executable,
    iree_hal_executable_export_ordinal_t export_ordinal,
    iree_host_size_t device_ordinal,
    const iree_hal_amdgpu_device_kernel_args_t** out_kernel_args) {
  *out_kernel_args = NULL;
  const iree_hal_amdgpu_executable_t* self =
      iree_hal_amdgpu_executable_const_cast(base_executable);
  // Validate the export ordinal first to match the host-args lookup.
  if (IREE_UNLIKELY(export_ordinal >= self->kernel_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "export ordinal %" PRIu32
                            " out of range; executable has %" PRIhsz " exports",
                            export_ordinal, self->kernel_count);
  }
  if (IREE_UNLIKELY(device_ordinal >= self->device_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "device ordinal %" PRIhsz
                            " out of range; executable topology has %" PRIhsz
                            " physical devices",
                            device_ordinal, self->device_count);
  }
  if (IREE_UNLIKELY(!iree_hal_amdgpu_physical_device_mask_contains(
          self->loaded_physical_device_mask, device_ordinal))) {
    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                            "device ordinal %" PRIhsz
                            " is not in executable queue affinity 0x%" PRIx64,
                            device_ordinal, self->queue_affinity);
  }
  *out_kernel_args = &self->device_kernel_args[device_ordinal][export_ordinal];
  return iree_ok_status();
}
// Returns the dispatch descriptor for |export_ordinal| on the given physical
// device. Descriptors are stored in a flat [device][export] host array.
iree_status_t iree_hal_amdgpu_executable_lookup_dispatch_descriptor_for_device(
    iree_hal_executable_t* base_executable,
    iree_hal_executable_export_ordinal_t export_ordinal,
    iree_host_size_t device_ordinal,
    const iree_hal_amdgpu_executable_dispatch_descriptor_t** out_descriptor) {
  *out_descriptor = NULL;
  const iree_hal_amdgpu_executable_t* self =
      iree_hal_amdgpu_executable_const_cast(base_executable);
  if (IREE_UNLIKELY(export_ordinal >= self->kernel_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "export ordinal %" PRIu32
                            " out of range; executable has %" PRIhsz " exports",
                            export_ordinal, self->kernel_count);
  }
  if (IREE_UNLIKELY(device_ordinal >= self->device_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "device ordinal %" PRIhsz
                            " out of range; executable topology has %" PRIhsz
                            " physical devices",
                            device_ordinal, self->device_count);
  }
  if (IREE_UNLIKELY(!iree_hal_amdgpu_physical_device_mask_contains(
          self->loaded_physical_device_mask, device_ordinal))) {
    return iree_make_status(IREE_STATUS_FAILED_PRECONDITION,
                            "device ordinal %" PRIhsz
                            " is not in executable queue affinity 0x%" PRIx64,
                            device_ordinal, self->queue_affinity);
  }
  // Row-major indexing into the flat per-device descriptor table.
  const iree_host_size_t descriptor_ordinal =
      device_ordinal * self->kernel_count + export_ordinal;
  *out_descriptor = &self->host_dispatch_descriptors[descriptor_ordinal];
  return iree_ok_status();
}
// Returns the number of exports (kernels) in the executable.
static iree_host_size_t iree_hal_amdgpu_executable_export_count(
    iree_hal_executable_t* base_executable) {
  return iree_hal_amdgpu_executable_cast(base_executable)->kernel_count;
}
// Copies the reflected export info for |export_ordinal| into |out_info|.
// |out_info| is zeroed even on failure so callers never see stale data.
static iree_status_t iree_hal_amdgpu_executable_export_info(
    iree_hal_executable_t* base_executable,
    iree_hal_executable_export_ordinal_t export_ordinal,
    iree_hal_executable_export_info_t* out_info) {
  memset(out_info, 0, sizeof(*out_info));
  iree_hal_amdgpu_executable_t* self =
      iree_hal_amdgpu_executable_cast(base_executable);
  if (IREE_UNLIKELY(export_ordinal >= self->kernel_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "export ordinal %" PRIu32
                            " out of range; executable has %" PRIhsz " exports",
                            export_ordinal, self->kernel_count);
  }
  *out_info = self->export_infos[export_ordinal];
  return iree_ok_status();
}
// Copies up to |capacity| reflected parameters for |export_ordinal| into
// |out_parameters|. Parameters beyond |capacity| are silently dropped; callers
// can size the buffer from the export info's parameter_count.
static iree_status_t iree_hal_amdgpu_executable_export_parameters(
    iree_hal_executable_t* base_executable,
    iree_hal_executable_export_ordinal_t export_ordinal,
    iree_host_size_t capacity,
    iree_hal_executable_export_parameter_t* out_parameters) {
  IREE_ASSERT_ARGUMENT(out_parameters || capacity == 0);
  iree_hal_amdgpu_executable_t* self =
      iree_hal_amdgpu_executable_cast(base_executable);
  if (IREE_UNLIKELY(export_ordinal >= self->kernel_count)) {
    return iree_make_status(IREE_STATUS_OUT_OF_RANGE,
                            "export ordinal %" PRIu32
                            " out of range; executable has %" PRIhsz " exports",
                            export_ordinal, self->kernel_count);
  }
  // The offsets table has kernel_count + 1 entries bounding each export span.
  const iree_host_size_t begin = self->export_parameter_offsets[export_ordinal];
  const iree_host_size_t end =
      self->export_parameter_offsets[export_ordinal + 1];
  iree_host_size_t copy_count = end - begin;
  if (copy_count > capacity) copy_count = capacity;
  if (copy_count > 0) {
    memcpy(out_parameters, &self->export_parameters[begin],
           copy_count * sizeof(out_parameters[0]));
  }
  return iree_ok_status();
}
// Linearly scans export reflection names for |name| and returns its ordinal.
static iree_status_t iree_hal_amdgpu_executable_lookup_export_by_name(
    iree_hal_executable_t* base_executable, iree_string_view_t name,
    iree_hal_executable_export_ordinal_t* out_export_ordinal) {
  iree_hal_amdgpu_executable_t* self =
      iree_hal_amdgpu_executable_cast(base_executable);
  for (iree_host_size_t ordinal = 0; ordinal < self->kernel_count; ++ordinal) {
    if (!iree_string_view_equal(self->export_infos[ordinal].name, name)) {
      continue;
    }
    *out_export_ordinal = (iree_hal_executable_export_ordinal_t)ordinal;
    return iree_ok_status();
  }
  return iree_make_status(IREE_STATUS_NOT_FOUND,
                          "export '%.*s' not found in executable",
                          (int)name.size, name.data);
}
// Release callback for buffers wrapping executable globals: drops the
// executable reference retained when the buffer was created.
static void iree_hal_amdgpu_executable_global_buffer_release(
    void* user_data, iree_hal_buffer_t* buffer) {
  iree_hal_executable_t* executable = (iree_hal_executable_t*)user_data;
  (void)buffer;  // unused; the memory is owned by the executable
  iree_hal_executable_release(executable);
}
// Looks up an executable-scope global variable by |name| and wraps its device
// memory in a HAL buffer. The buffer does not own the memory: it retains the
// executable and releases it via the buffer release callback, so the variable
// stays valid for the buffer's lifetime.
//
// |queue_affinity| selects which physical device's instantiation of the
// variable is exposed; an empty affinity means "any".
static iree_status_t iree_hal_amdgpu_executable_lookup_global_by_name(
    iree_hal_executable_t* base_executable, iree_string_view_t name,
    iree_hal_queue_affinity_t queue_affinity, iree_hal_buffer_t** out_buffer) {
  iree_hal_amdgpu_executable_t* executable =
      iree_hal_amdgpu_executable_cast(base_executable);
  *out_buffer = NULL;
  // Empty affinity widens to ANY before normalization.
  iree_hal_queue_affinity_t requested_queue_affinity = queue_affinity;
  if (iree_hal_queue_affinity_is_empty(requested_queue_affinity)) {
    requested_queue_affinity = IREE_HAL_QUEUE_AFFINITY_ANY;
  }
  // Map the requested affinity onto a single physical device within the
  // executable's affinity domain.
  iree_hal_queue_affinity_t selected_queue_affinity = 0;
  iree_host_size_t physical_device_ordinal = 0;
  IREE_RETURN_IF_ERROR(
      iree_hal_amdgpu_queue_affinity_normalize_for_physical_device(
          executable->queue_affinity_domain, requested_queue_affinity,
          &selected_queue_affinity, &physical_device_ordinal));
  hsa_executable_symbol_t symbol = {0};
  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_executable_get_global_symbol_by_name(
      executable->libhsa, executable->handle, name,
      executable->device_agents[physical_device_ordinal], &symbol));
  // Only variable symbols qualify as globals; a kernel symbol with the same
  // name is reported as not-found.
  hsa_symbol_kind_t symbol_kind = HSA_SYMBOL_KIND_KERNEL;
  IREE_RETURN_IF_ERROR(iree_hsa_executable_symbol_get_info(
      IREE_LIBHSA(executable->libhsa), symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
      &symbol_kind));
  if (symbol_kind != HSA_SYMBOL_KIND_VARIABLE) {
    return iree_make_status(IREE_STATUS_NOT_FOUND,
                            "executable global `%.*s` not found",
                            (int)name.size, name.data);
  }
  // Query the device address and size of the variable's storage.
  uint64_t variable_address = 0;
  IREE_RETURN_IF_ERROR(iree_hsa_executable_symbol_get_info(
      IREE_LIBHSA(executable->libhsa), symbol,
      HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &variable_address));
  uint32_t variable_size = 0;
  IREE_RETURN_IF_ERROR(iree_hsa_executable_symbol_get_info(
      IREE_LIBHSA(executable->libhsa), symbol,
      HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &variable_size));
  iree_hal_buffer_placement_t placement = {
      .device = executable->device,
      .queue_affinity = selected_queue_affinity,
      .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE,
  };
  iree_hal_buffer_release_callback_t release_callback = {
      .fn = iree_hal_amdgpu_executable_global_buffer_release,
      .user_data = base_executable,
  };
  // Retain before create; the release callback (or the failure path below)
  // balances this reference.
  iree_hal_executable_retain(base_executable);
  iree_status_t status = iree_hal_amdgpu_buffer_create(
      executable->libhsa, placement, IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL,
      IREE_HAL_MEMORY_ACCESS_READ | IREE_HAL_MEMORY_ACCESS_WRITE,
      IREE_HAL_BUFFER_USAGE_DEFAULT, variable_size, variable_size,
      (void*)(uintptr_t)variable_address, release_callback,
      executable->host_allocator, out_buffer);
  if (!iree_status_is_ok(status)) {
    iree_hal_executable_release(base_executable);
  }
  return status;
}
// HAL executable vtable routing the generic iree_hal_executable_* API to the
// AMDGPU implementations above.
static const iree_hal_executable_vtable_t iree_hal_amdgpu_executable_vtable = {
    .destroy = iree_hal_amdgpu_executable_destroy,
    .export_count = iree_hal_amdgpu_executable_export_count,
    .export_info = iree_hal_amdgpu_executable_export_info,
    .export_parameters = iree_hal_amdgpu_executable_export_parameters,
    .lookup_export_by_name = iree_hal_amdgpu_executable_lookup_export_by_name,
    .lookup_global_by_name = iree_hal_amdgpu_executable_lookup_global_by_name,
};