[HAL/AMDGPU] Name prepublished kernarg storage

Reusable AQL command buffers were materializing prepublished kernarg templates by asking the HAL allocator for DEVICE_LOCAL|HOST_VISIBLE memory and trusting the AMDGPU allocator to resolve that to a fine-grained host-coherent device pool. That was the right local outcome, but the contract was anonymous: a future allocator policy change could silently put template bytes behind a non-coherent mapping and make replay correctness depend on a late flush branch.

Introduce a small AQL prepublished-kernarg storage strategy and record the current device-fine host-coherent strategy on each physical device. Logical-device command-buffer creation passes the selected strategy into the command buffer after queue affinity has been normalized to one physical device, and recording only prepublishes reusable static dispatches when the strategy is enabled.

Finalization now requests DEVICE_LOCAL|HOST_VISIBLE|HOST_COHERENT explicitly and verifies that the returned AMDGPU buffer actually has those memory-type bits before copying templates. The old non-coherent flush fallback is gone because it was not a real strategy. Missing fine-grained device-local pools also get explicit allocator and physical-device diagnostics instead of an opaque pool lookup failure.

The low-level command-buffer tests now use a real heap allocator instead of constructing command buffers with a null allocator, preserving the production object invariant while keeping prepublish storage disabled for tests that only exercise recording or block processing. Add a host-queue integration check for the recorded device-fine strategy alongside the existing real prepublished-dispatch execution test.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/BUILD.bazel b/runtime/src/iree/hal/drivers/amdgpu/BUILD.bazel
index 53ce1e1..d4bb6fb 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/BUILD.bazel
+++ b/runtime/src/iree/hal/drivers/amdgpu/BUILD.bazel
@@ -61,6 +61,7 @@
         "aql_command_buffer.h",
         "aql_command_buffer_profile.c",
         "aql_command_buffer_profile.h",
+        "aql_prepublished_kernarg_storage.h",
         "aql_program_builder.c",
         "aql_program_builder.h",
         "aql_program_validation.c",
@@ -197,6 +198,7 @@
         "aql_block_processor_timestamp.h",
         "aql_command_buffer.h",
         "aql_command_buffer_profile.h",
+        "aql_prepublished_kernarg_storage.h",
         "aql_program_builder.h",
         "aql_program_validation.h",
         "buffer.h",
diff --git a/runtime/src/iree/hal/drivers/amdgpu/CMakeLists.txt b/runtime/src/iree/hal/drivers/amdgpu/CMakeLists.txt
index 45d8689..a6fb9fb 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/CMakeLists.txt
+++ b/runtime/src/iree/hal/drivers/amdgpu/CMakeLists.txt
@@ -71,6 +71,7 @@
     "aql_command_buffer.h"
     "aql_command_buffer_profile.c"
     "aql_command_buffer_profile.h"
+    "aql_prepublished_kernarg_storage.h"
     "aql_program_builder.c"
     "aql_program_builder.h"
     "aql_program_validation.c"
@@ -202,6 +203,7 @@
     "aql_block_processor_timestamp.h"
     "aql_command_buffer.h"
     "aql_command_buffer_profile.h"
+    "aql_prepublished_kernarg_storage.h"
     "aql_program_builder.h"
     "aql_program_validation.h"
     "buffer.h"
diff --git a/runtime/src/iree/hal/drivers/amdgpu/allocator.c b/runtime/src/iree/hal/drivers/amdgpu/allocator.c
index 21a26fc..6a72910 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/allocator.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/allocator.c
@@ -349,6 +349,14 @@
     if (iree_status_is_ok(status)) {
       status = iree_hal_amdgpu_find_fine_global_memory_pool(
           libhsa, topology->gpu_agents[i], &device_fine_pool);
+      if (!iree_status_is_ok(status)) {
+        status = iree_status_annotate_f(
+            status,
+            "AMDGPU allocator requires fine-grained device-local memory for "
+            "host-coherent DEVICE_LOCAL|HOST_VISIBLE allocations on physical "
+            "device %" PRIhsz,
+            i);
+      }
     }
     if (iree_status_is_ok(status)) {
       status = iree_hal_amdgpu_allocator_query_pool_properties(
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_test.cc b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_test.cc
index 84dd9ce..86429e4 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_test.cc
@@ -505,6 +505,9 @@
 class AqlBlockProcessorRecordedTest : public ::testing::Test {
  protected:
   void SetUp() override {
+    IREE_ASSERT_OK(iree_hal_allocator_create_heap(
+        iree_make_cstring_view("aql_block_processor_test"),
+        iree_allocator_system(), iree_allocator_system(), &device_allocator_));
     iree_hal_amdgpu_profile_metadata_initialize(iree_allocator_system(),
                                                 &profile_metadata_);
     IREE_ASSERT_OK(iree_hal_amdgpu_aql_program_block_pool_initialize(
@@ -514,15 +517,18 @@
   void TearDown() override {
     iree_arena_block_pool_deinitialize(&block_pool_);
     iree_hal_amdgpu_profile_metadata_deinitialize(&profile_metadata_);
+    iree_hal_allocator_release(device_allocator_);
   }
 
   CommandBufferPtr CreateCommandBuffer(iree_host_size_t binding_capacity) {
     iree_hal_command_buffer_t* command_buffer = nullptr;
     IREE_EXPECT_OK(iree_hal_amdgpu_aql_command_buffer_create(
-        /*device_allocator=*/nullptr, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+        device_allocator_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
         IREE_HAL_COMMAND_CATEGORY_ANY, IREE_HAL_QUEUE_AFFINITY_ANY,
-        binding_capacity, /*device_ordinal=*/0, &profile_metadata_,
-        &block_pool_, &block_pool_, iree_allocator_system(), &command_buffer));
+        binding_capacity, /*device_ordinal=*/0,
+        iree_hal_amdgpu_aql_prepublished_kernarg_storage_disabled(),
+        &profile_metadata_, &block_pool_, &block_pool_, iree_allocator_system(),
+        &command_buffer));
     return CommandBufferPtr(command_buffer);
   }
 
@@ -540,6 +546,8 @@
   }
 
  private:
+  // Test allocator borrowed by command buffers for validation.
+  iree_hal_allocator_t* device_allocator_ = nullptr;
   // Fixed block size used by recorded command-buffer tests.
   iree_host_size_t block_size_ = 4096;
   // Program and resource-set block pool borrowed by test command buffers.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
index b778e71..14cab59 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
@@ -158,6 +158,8 @@
   } static_buffers;
   // Device-visible storage containing prepublished static dispatch kernargs.
   struct {
+    // Cold-path storage strategy selected during command-buffer creation.
+    iree_hal_amdgpu_aql_prepublished_kernarg_storage_t storage;
     // Retained buffer containing all prepublished kernarg templates.
     iree_hal_buffer_t* buffer;
     // Device pointer to the first byte of |buffer|.
@@ -241,6 +243,16 @@
 #endif  // IREE_HAL_COMMAND_BUFFER_VALIDATION_ENABLE
 }
 
+// Returns true when |command_buffer| holds a prepublished kernarg storage
+// strategy other than DISABLED.
+static bool iree_hal_amdgpu_aql_command_buffer_prepublish_enabled(
+    const iree_hal_amdgpu_aql_command_buffer_t* command_buffer) {
+  const iree_hal_amdgpu_aql_prepublished_kernarg_storage_strategy_t strategy =
+      command_buffer->prepublished_kernargs.storage.strategy;
+  return strategy !=
+         IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DISABLED;
+}
+
 static void iree_hal_amdgpu_aql_command_buffer_reset_resources(
     iree_hal_amdgpu_aql_command_buffer_t* command_buffer) {
   iree_hal_resource_set_free(command_buffer->resource_set);
@@ -600,6 +608,44 @@
 }
 
+// Checks that |buffer| reports every memory type bit in |required_type|.
+//
+// Returns OK when all required bits are present and FAILED_PRECONDITION
+// otherwise. |command_buffer| is only read to report its storage strategy.
 static iree_status_t
+iree_hal_amdgpu_aql_command_buffer_verify_prepublished_kernarg_storage(
+    const iree_hal_amdgpu_aql_command_buffer_t* command_buffer,
+    iree_hal_memory_type_t required_type, iree_hal_buffer_t* buffer) {
+  const iree_hal_memory_type_t actual_type =
+      iree_hal_buffer_memory_type(buffer);
+  if (IREE_LIKELY(iree_all_bits_set(actual_type, required_type))) {
+    return iree_ok_status();
+  }
+#if IREE_STATUS_MODE
+  // Render both bitfields as strings for the diagnostic message.
+  iree_bitfield_string_temp_t required_temp;
+  iree_bitfield_string_temp_t actual_temp;
+  const iree_string_view_t required_string =
+      iree_hal_memory_type_format(required_type, &required_temp);
+  const iree_string_view_t actual_string =
+      iree_hal_memory_type_format(actual_type, &actual_temp);
+  // The enum's underlying integer type is implementation-defined, so convert
+  // it to the unsigned int that %u expects.
+  return iree_make_status(
+      IREE_STATUS_FAILED_PRECONDITION,
+      "prepublished command-buffer kernarg strategy %u requires "
+      "memory_type=%.*s but allocation returned memory_type=%.*s",
+      (unsigned)command_buffer->prepublished_kernargs.storage.strategy,
+      (int)required_string.size, required_string.data, (int)actual_string.size,
+      actual_string.data);
+#else
+  return iree_make_status(
+      IREE_STATUS_FAILED_PRECONDITION,
+      "prepublished command-buffer kernarg allocation returned incompatible "
+      "memory type");
+#endif  // IREE_STATUS_MODE
+}
+
+static iree_status_t
 iree_hal_amdgpu_aql_command_buffer_materialize_prepublished_kernargs(
     iree_hal_amdgpu_aql_command_buffer_t* command_buffer) {
   iree_host_size_t template_count = 0;
@@ -611,6 +650,13 @@
   if (template_count == 0) {
     return iree_ok_status();
   }
+  if (IREE_UNLIKELY(!iree_hal_amdgpu_aql_command_buffer_prepublish_enabled(
+          command_buffer))) {
+    return iree_make_status(
+        IREE_STATUS_FAILED_PRECONDITION,
+        "command buffer recorded prepublished kernargs without a storage "
+        "strategy");
+  }
   if (IREE_UNLIKELY(payload_length > UINT32_MAX)) {
     return iree_make_status(
         IREE_STATUS_OUT_OF_RANGE,
@@ -640,12 +686,9 @@
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, allocation_length);
   IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, template_count);
 
-  iree_hal_buffer_params_t params = {0};
-  params.type =
-      IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL | IREE_HAL_MEMORY_TYPE_HOST_VISIBLE;
-  params.access = IREE_HAL_MEMORY_ACCESS_ALL;
-  params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ |
-                 IREE_HAL_BUFFER_USAGE_MAPPING;
+  iree_hal_buffer_params_t params =
+      command_buffer->prepublished_kernargs.storage.buffer_params;
+  params.queue_affinity = command_buffer->base.queue_affinity;
 
   iree_hal_buffer_t* template_buffer = NULL;
   iree_status_t status = iree_hal_allocator_allocate_buffer(
@@ -665,6 +708,11 @@
     }
   }
   if (iree_status_is_ok(status)) {
+    status =
+        iree_hal_amdgpu_aql_command_buffer_verify_prepublished_kernarg_storage(
+            command_buffer, params.type, template_buffer);
+  }
+  if (iree_status_is_ok(status)) {
     status = iree_hal_buffer_map_range(
         template_buffer, IREE_HAL_MAPPING_MODE_SCOPED,
         IREE_HAL_MEMORY_ACCESS_DISCARD_WRITE, /*byte_offset=*/0,
@@ -675,12 +723,6 @@
     status = iree_hal_amdgpu_aql_command_buffer_copy_prepublished_kernargs(
         command_buffer, &mapping, device_base);
   }
-  if (iree_status_is_ok(status) &&
-      !iree_all_bits_set(iree_hal_buffer_memory_type(template_buffer),
-                         IREE_HAL_MEMORY_TYPE_HOST_COHERENT)) {
-    status = iree_hal_buffer_mapping_flush_range(&mapping, /*byte_offset=*/0,
-                                                 IREE_HAL_WHOLE_BUFFER);
-  }
   if (mapping.buffer) {
     status = iree_status_join(status, iree_hal_buffer_unmap_range(&mapping));
   }
@@ -709,11 +751,14 @@
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_host_size_t device_ordinal,
+    iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
+        prepublished_kernarg_storage,
     iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
     iree_arena_block_pool_t* program_block_pool,
     iree_arena_block_pool_t* resource_set_block_pool,
     iree_allocator_t host_allocator,
     iree_hal_command_buffer_t** out_command_buffer) {
+  IREE_ASSERT_ARGUMENT(device_allocator);
   IREE_ASSERT_ARGUMENT(out_command_buffer);
   *out_command_buffer = NULL;
 
@@ -744,6 +789,16 @@
                             " exceeds uint32_t storage",
                             device_ordinal);
   }
+  switch (prepublished_kernarg_storage.strategy) {
+    case IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DISABLED:
+    case IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DEVICE_FINE_HOST_COHERENT:
+      break;  // Recognized strategy; nothing further to check here.
+    default:  // Any other value is rejected; cast so the enum matches %u.
+      return iree_make_status(
+          IREE_STATUS_INVALID_ARGUMENT,
+          "unsupported prepublished command-buffer kernarg storage strategy %u",
+          (unsigned)prepublished_kernarg_storage.strategy);
+  }
 
   IREE_TRACE_ZONE_BEGIN(z0);
 
@@ -771,6 +826,7 @@
   command_buffer->block_pools.resource_set = resource_set_block_pool;
   command_buffer->profile.metadata = profile_metadata;
   command_buffer->device_ordinal = (uint32_t)device_ordinal;
+  command_buffer->prepublished_kernargs.storage = prepublished_kernarg_storage;
   iree_arena_initialize(program_block_pool, &command_buffer->recording_arena);
   iree_hal_amdgpu_aql_program_builder_initialize(program_block_pool,
                                                  &command_buffer->builder);
@@ -1955,7 +2011,8 @@
   // Prepublication is a reusable-command-buffer strategy for immutable
   // kernargs. It materializes static kernargs once at end() so replay avoids
   // queue-time kernarg reservation, binding patching, and block growth.
-  if (!iree_all_bits_set(command_buffer->base.mode,
+  if (iree_hal_amdgpu_aql_command_buffer_prepublish_enabled(command_buffer) &&
+      !iree_all_bits_set(command_buffer->base.mode,
                          IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT) &&
       !uses_indirect_parameters &&
       !iree_hal_amdgpu_aql_dispatch_plan_has_dynamic_bindings(plan)) {
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
index c24ba63..1cec5e8 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
@@ -10,6 +10,7 @@
 #include "iree/base/api.h"
 #include "iree/base/internal/arena.h"
 #include "iree/hal/api.h"
+#include "iree/hal/drivers/amdgpu/aql_prepublished_kernarg_storage.h"
 #include "iree/hal/drivers/amdgpu/aql_program_builder.h"
 #include "iree/hal/drivers/amdgpu/profile_metadata.h"
 
@@ -28,6 +29,8 @@
     iree_hal_command_category_t command_categories,
     iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity,
     iree_host_size_t device_ordinal,
+    iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
+        prepublished_kernarg_storage,
     iree_hal_amdgpu_profile_metadata_registry_t* profile_metadata,
     iree_arena_block_pool_t* program_block_pool,
     iree_arena_block_pool_t* resource_set_block_pool,
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer_test.cc b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer_test.cc
index 6190436..d423d74 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer_test.cc
@@ -29,6 +29,9 @@
 class AqlCommandBufferTest : public ::testing::Test {
  protected:
   void SetUp() override {
+    IREE_ASSERT_OK(iree_hal_allocator_create_heap(
+        iree_make_cstring_view("aql_command_buffer_test"),
+        iree_allocator_system(), iree_allocator_system(), &device_allocator_));
     iree_hal_amdgpu_profile_metadata_initialize(iree_allocator_system(),
                                                 &profile_metadata_);
     IREE_ASSERT_OK(iree_hal_amdgpu_aql_program_block_pool_initialize(
@@ -38,6 +41,7 @@
   void TearDown() override {
     iree_arena_block_pool_deinitialize(&block_pool_);
     iree_hal_amdgpu_profile_metadata_deinitialize(&profile_metadata_);
+    iree_hal_allocator_release(device_allocator_);
   }
 
   CommandBufferPtr CreateCommandBufferWithMode(
@@ -53,8 +57,9 @@
       iree_host_size_t binding_capacity = 0) {
     iree_hal_command_buffer_t* command_buffer = nullptr;
     IREE_EXPECT_OK(iree_hal_amdgpu_aql_command_buffer_create(
-        /*device_allocator=*/nullptr, mode, IREE_HAL_COMMAND_CATEGORY_ANY,
+        device_allocator_, mode, IREE_HAL_COMMAND_CATEGORY_ANY,
         IREE_HAL_QUEUE_AFFINITY_ANY, binding_capacity, /*device_ordinal=*/0,
+        iree_hal_amdgpu_aql_prepublished_kernarg_storage_disabled(),
         profile_metadata, &block_pool_, &block_pool_, iree_allocator_system(),
         &command_buffer));
     return CommandBufferPtr(command_buffer);
@@ -70,8 +75,13 @@
   }
 
  private:
+  // Test allocator borrowed by command buffers for validation.
+  iree_hal_allocator_t* device_allocator_ = nullptr;
+  // Fixed block size used by command-buffer tests.
   iree_host_size_t block_size_ = 256;
+  // Program and resource-set block pool borrowed by test command buffers.
   iree_arena_block_pool_t block_pool_;
+  // Profile metadata registry borrowed by test command buffers.
   iree_hal_amdgpu_profile_metadata_registry_t profile_metadata_;
 };
 
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_prepublished_kernarg_storage.h b/runtime/src/iree/hal/drivers/amdgpu/aql_prepublished_kernarg_storage.h
new file mode 100644
index 0000000..8cb3bfe
--- /dev/null
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_prepublished_kernarg_storage.h
@@ -0,0 +1,68 @@
+// Copyright 2026 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef IREE_HAL_DRIVERS_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_H_
+#define IREE_HAL_DRIVERS_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_H_
+
+#include "iree/base/api.h"
+#include "iree/hal/api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// Strategy used to materialize reusable command-buffer kernarg templates.
+typedef enum iree_hal_amdgpu_aql_prepublished_kernarg_storage_strategy_e {
+  // No storage is selected; kernarg templates are not prepublished.
+  IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DISABLED = 0,
+  // Device-local fine-grained memory that is CPU-visible and host-coherent.
+  IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DEVICE_FINE_HOST_COHERENT =
+      1,
+} iree_hal_amdgpu_aql_prepublished_kernarg_storage_strategy_t;
+
+// Storage strategy for finalized reusable command-buffer kernarg templates.
+typedef struct iree_hal_amdgpu_aql_prepublished_kernarg_storage_t {
+  // Selected backing strategy.
+  iree_hal_amdgpu_aql_prepublished_kernarg_storage_strategy_t strategy;
+  // HAL allocation parameters used for materialized kernarg storage.
+  iree_hal_buffer_params_t buffer_params;
+} iree_hal_amdgpu_aql_prepublished_kernarg_storage_t;
+
+// Returns a storage description with the DISABLED strategy and all buffer
+// parameters zero-initialized.
+static inline iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
+iree_hal_amdgpu_aql_prepublished_kernarg_storage_disabled(void) {
+  iree_hal_amdgpu_aql_prepublished_kernarg_storage_t storage = {
+      IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DISABLED};
+  return storage;
+}
+
+// Returns the DEVICE_FINE_HOST_COHERENT storage description.
+//
+// The buffer parameters request DEVICE_LOCAL|HOST_VISIBLE|HOST_COHERENT
+// memory with IREE_HAL_MEMORY_ACCESS_ALL access and
+// DISPATCH_UNIFORM_READ|MAPPING usage; all other parameters stay zeroed.
+static inline iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
+iree_hal_amdgpu_aql_prepublished_kernarg_storage_device_fine_host_coherent(
+    void) {
+  iree_hal_amdgpu_aql_prepublished_kernarg_storage_t storage =
+      iree_hal_amdgpu_aql_prepublished_kernarg_storage_disabled();
+  storage.strategy =
+      IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DEVICE_FINE_HOST_COHERENT;
+  storage.buffer_params.type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                               IREE_HAL_MEMORY_TYPE_HOST_VISIBLE |
+                               IREE_HAL_MEMORY_TYPE_HOST_COHERENT;
+  storage.buffer_params.access = IREE_HAL_MEMORY_ACCESS_ALL;
+  storage.buffer_params.usage = IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ |
+                                IREE_HAL_BUFFER_USAGE_MAPPING;
+  return storage;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // IREE_HAL_DRIVERS_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_H_
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
index 7765c31..0dbd202 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
@@ -1447,6 +1447,38 @@
   }
 }
 
+// Checks the prepublished kernarg storage recorded on physical device 0 of a
+// freshly initialized logical device.
+TEST_F(HostQueueCommandBufferTest,
+       PrepublishedKernargsUseRecordedDeviceFineStorage) {
+  iree_hal_amdgpu_logical_device_options_t options;
+  iree_hal_amdgpu_logical_device_options_initialize(&options);
+  options.preallocate_pools = 0;
+
+  TestLogicalDevice test_device;
+  IREE_ASSERT_OK(
+      test_device.Initialize(&options, &libhsa_, &topology_, host_allocator_));
+  iree_hal_amdgpu_logical_device_t* logical_device =
+      test_device.logical_device();
+  ASSERT_GT(logical_device->physical_device_count, 0u);
+
+  const iree_hal_amdgpu_aql_prepublished_kernarg_storage_t& storage =
+      logical_device->physical_devices[0]->prepublished_kernarg_storage;
+  const auto required_type = IREE_HAL_MEMORY_TYPE_DEVICE_LOCAL |
+                             IREE_HAL_MEMORY_TYPE_HOST_VISIBLE |
+                             IREE_HAL_MEMORY_TYPE_HOST_COHERENT;
+  const auto required_usage = IREE_HAL_BUFFER_USAGE_DISPATCH_UNIFORM_READ |
+                              IREE_HAL_BUFFER_USAGE_MAPPING;
+
+  EXPECT_EQ(
+      storage.strategy,
+      IREE_HAL_AMDGPU_AQL_PREPUBLISHED_KERNARG_STORAGE_STRATEGY_DEVICE_FINE_HOST_COHERENT);
+  EXPECT_TRUE(iree_all_bits_set(storage.buffer_params.type, required_type));
+  EXPECT_TRUE(iree_all_bits_set(storage.buffer_params.access,
+                                IREE_HAL_MEMORY_ACCESS_ALL));
+  EXPECT_TRUE(iree_all_bits_set(storage.buffer_params.usage, required_usage));
+}
+
 TEST_F(HostQueueCommandBufferTest, DirectDispatchUsesPrepublishedKernargs) {
   iree_hal_amdgpu_logical_device_options_t options;
   iree_hal_amdgpu_logical_device_options_initialize(&options);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
index f715c8a..8270f80 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/logical_device.c
@@ -2146,9 +2146,12 @@
       iree_hal_amdgpu_logical_device_normalize_command_buffer_affinity(
           logical_device, queue_affinity, &effective_queue_affinity,
           &device_ordinal));
+  const iree_hal_amdgpu_physical_device_t* physical_device =
+      logical_device->physical_devices[device_ordinal];
   return iree_hal_amdgpu_aql_command_buffer_create(
       iree_hal_device_allocator(base_device), mode, command_categories,
       effective_queue_affinity, binding_capacity, device_ordinal,
+      physical_device->prepublished_kernarg_storage,
       &logical_device->profile_metadata,
       &logical_device->host_block_pools.command_buffer,
       &logical_device->host_block_pools.small, logical_device->host_allocator,
diff --git a/runtime/src/iree/hal/drivers/amdgpu/physical_device.c b/runtime/src/iree/hal/drivers/amdgpu/physical_device.c
index 121d1c9..e0f3ae5 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/physical_device.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/physical_device.c
@@ -498,10 +498,27 @@
     iree_hal_amdgpu_libhsa_t* libhsa, hsa_agent_t device_agent,
     hsa_amd_memory_pool_t* out_coarse_block_memory_pool,
     hsa_amd_memory_pool_t* out_fine_block_memory_pool) {
-  IREE_RETURN_IF_ERROR(iree_hal_amdgpu_find_coarse_global_memory_pool(
-      libhsa, device_agent, out_coarse_block_memory_pool));
-  return iree_hal_amdgpu_find_fine_global_memory_pool(
-      libhsa, device_agent, out_fine_block_memory_pool);
+  iree_status_t status = iree_hal_amdgpu_find_coarse_global_memory_pool(
+      libhsa, device_agent, out_coarse_block_memory_pool);
+  if (iree_status_is_ok(status)) {
+    status = iree_hal_amdgpu_find_fine_global_memory_pool(
+        libhsa, device_agent, out_fine_block_memory_pool);
+  }
+  if (!iree_status_is_ok(status)) {
+    status = iree_status_annotate(
+        status, IREE_SV("AMDGPU physical device requires coarse and fine "
+                        "device-local global memory pools"));
+  }
+  return status;
+}
+
+static iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
+iree_hal_amdgpu_physical_device_select_prepublished_kernarg_storage(
+    hsa_amd_memory_pool_t fine_block_memory_pool) {
+  // Only a non-zero fine-grained pool handle enables prepublished storage.
+  return fine_block_memory_pool.handle
+             ? iree_hal_amdgpu_aql_prepublished_kernarg_storage_device_fine_host_coherent()
+             : iree_hal_amdgpu_aql_prepublished_kernarg_storage_disabled();
 }
 
 typedef struct iree_hal_amdgpu_physical_device_kernarg_ring_memory_t {
@@ -970,6 +987,11 @@
         &fine_block_memory_pool);
   }
   if (iree_status_is_ok(status)) {
+    out_physical_device->prepublished_kernarg_storage =
+        iree_hal_amdgpu_physical_device_select_prepublished_kernarg_storage(
+            fine_block_memory_pool);
+  }
+  if (iree_status_is_ok(status)) {
     status = iree_hal_amdgpu_physical_device_preallocate_host_pool(
         options, out_physical_device);
   }
diff --git a/runtime/src/iree/hal/drivers/amdgpu/physical_device.h b/runtime/src/iree/hal/drivers/amdgpu/physical_device.h
index eab38f3..9d0afce 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/physical_device.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/physical_device.h
@@ -9,6 +9,7 @@
 
 #include "iree/base/api.h"
 #include "iree/base/internal/arena.h"
+#include "iree/hal/drivers/amdgpu/aql_prepublished_kernarg_storage.h"
 #include "iree/hal/drivers/amdgpu/buffer.h"
 #include "iree/hal/drivers/amdgpu/host_queue.h"
 #include "iree/hal/drivers/amdgpu/host_queue_staging.h"
@@ -226,6 +227,9 @@
   // CPU-visible coarse-grained device-memory capability for this GPU.
   iree_hal_amdgpu_cpu_visible_device_coarse_memory_t
       cpu_visible_device_coarse_memory;
+  // Prepublished command-buffer kernarg storage capability for this GPU.
+  iree_hal_amdgpu_aql_prepublished_kernarg_storage_t
+      prepublished_kernarg_storage;
 
   // Fine-grained block pools for device memory blocks of various sizes.
   iree_hal_amdgpu_block_pools_t fine_block_pools;