[HAL/AMDGPU] Compact dynamic binding pointer replay

Dynamic command-buffer replay was still caching resolved binding pointers in a sparse array indexed by the original queue_execute binding slot. That kept replay scratch storage sized by the command buffer's binding count (and so by its highest binding slot) even after finalization had built a compact per-block dynamic-slot sidecar.

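Rewrite dynamic dispatch binding-source slots at command-buffer finalization to dense sidecar ordinals. The sidecar continues to store the original queue_execute binding slots, while host replay resolves only the slots actually used into a compact pointer table consumed by both the base and profiling block processors. Dynamic indirect-parameter sources keep their original queue_execute binding slot because they are resolved directly from the binding table rather than from the compact pointer table.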

Static and fully prepublished blocks keep the same no-sidecar path, while dynamic replay gets a denser host-side representation for future queue-upload and device-fixup publication.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h b/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h
index 83a3d91..6b44774 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h
@@ -224,11 +224,18 @@
     iree_hal_amdgpu_command_buffer_binding_source_t {
   // Static raw source: final raw device pointer.
   //
-  // Dynamic or static-buffer source: byte offset added to the queue_execute
-  // binding table slot or command-buffer static buffer ordinal in |slot|.
+  // Dynamic source: byte offset added to the resolved pointer table entry in
+  // |slot|. The block dynamic-binding-slot sidecar maps that dense table entry
+  // back to the original queue_execute binding table slot.
+  //
+  // Static-buffer source: byte offset added to the command-buffer static buffer
+  // ordinal in |slot|.
   uint64_t offset_or_pointer;
-  // Dynamic source binding table slot or static buffer ordinal. Must be zero
-  // for raw static sources.
+  // Dynamic source dense resolved pointer table ordinal or static buffer
+  // ordinal. Dynamic indirect-parameter sources keep the original queue_execute
+  // binding table slot because they are resolved directly from the binding
+  // table instead of the dispatch kernarg pointer cache. Must be zero for raw
+  // static sources.
   uint32_t slot;
   // Destination HAL ABI binding pointer ordinal for compact patch lists.
   uint16_t target_binding_ordinal;
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h
index 87d7736..074319d 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h
@@ -42,7 +42,7 @@
   struct {
     // Binding table supplied to queue_execute.
     iree_hal_buffer_binding_table_t table;
-    // Pre-resolved dynamic binding pointers indexed by binding slot.
+    // Pre-resolved dynamic binding pointers indexed by block sidecar ordinal.
     const uint64_t* ptrs;
   } bindings;
   // Reserved packet span populated by the processor.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h
index fccbfce..924fc5e 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h
@@ -68,7 +68,7 @@
   struct {
     // Binding table supplied to queue_execute.
     iree_hal_buffer_binding_table_t table;
-    // Pre-resolved dynamic binding pointers indexed by binding slot.
+    // Pre-resolved dynamic binding pointers indexed by block sidecar ordinal.
     const uint64_t* ptrs;
   } bindings;
   // Reserved packet span populated by profiled replay.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
index e3f89cd..c70eaa7 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
@@ -767,13 +767,17 @@
              0;
 }
 
-static bool
-iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slot_list_contains(
-    const uint32_t* values, uint16_t count, uint32_t slot) {
+static uint16_t
+iree_hal_amdgpu_aql_command_buffer_find_or_append_dynamic_binding_slot(
+    uint32_t* values, uint16_t* inout_count, uint16_t capacity, uint32_t slot) {
+  const uint16_t count = *inout_count;
   for (uint16_t i = 0; i < count; ++i) {
-    if (values[i] == slot) return true;
+    if (values[i] == slot) return i;
   }
-  return false;
+  IREE_ASSERT(count < capacity);
+  values[count] = slot;
+  *inout_count = count + 1;
+  return count;
 }
 
 static iree_status_t
@@ -798,10 +802,10 @@
       (const uint32_t*)((uint8_t*)slot_block + values_offset);
   uint32_t* slot_values = (uint32_t*)((uint8_t*)slot_block + values_offset);
 
-  const iree_hal_amdgpu_command_buffer_binding_source_t* binding_sources =
-      iree_hal_amdgpu_command_buffer_block_binding_sources_const(block);
+  iree_hal_amdgpu_command_buffer_binding_source_t* binding_sources =
+      iree_hal_amdgpu_command_buffer_block_binding_sources(block);
   for (uint16_t i = 0; i < block->binding_source_count; ++i) {
-    const iree_hal_amdgpu_command_buffer_binding_source_t* binding_source =
+    iree_hal_amdgpu_command_buffer_binding_source_t* binding_source =
         &binding_sources[i];
     if (!iree_hal_amdgpu_aql_command_buffer_binding_source_uses_dynamic_binding_slot(
             binding_source)) {
@@ -816,11 +820,10 @@
                               block->block_ordinal, binding_source->slot,
                               command_buffer->base.binding_count);
     }
-    if (iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slot_list_contains(
-            slot_values, slot_block->slots.count, binding_source->slot)) {
-      continue;
-    }
-    slot_values[slot_block->slots.count++] = binding_source->slot;
+    binding_source->slot =
+        iree_hal_amdgpu_aql_command_buffer_find_or_append_dynamic_binding_slot(
+            slot_values, &slot_block->slots.count, dynamic_source_count,
+            binding_source->slot);
   }
   if (slot_block->slots.count == 0) return iree_ok_status();
   if (IREE_UNLIKELY(command_buffer->dynamic_binding_slots.count >
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
index 6f74a83..26a99e6 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
@@ -78,7 +78,9 @@
 
 // Dynamic queue_execute binding slots used by one finalized block.
 typedef struct iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slots_t {
-  // Binding table slots resolved into raw device pointers before block replay.
+  // Queue_execute binding table slots resolved into the dense raw pointer table
+  // before block replay. Dynamic binding-source records index this array by
+  // dense ordinal, not by original queue_execute binding slot.
   const uint32_t* values;
   // Number of entries in |values|.
   uint16_t count;
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c
index 7bb8704..0c40c9a 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c
@@ -112,7 +112,7 @@
   }
 
   uint64_t* binding_ptrs = NULL;
-  if (command_buffer->binding_count <=
+  if (dynamic_binding_slots.count <=
       IREE_HAL_AMDGPU_HOST_QUEUE_COMMAND_BUFFER_BINDING_SCRATCH_CAPACITY) {
     IREE_RETURN_IF_ERROR(
         iree_hal_amdgpu_host_queue_ensure_command_buffer_scratch(queue));
@@ -121,9 +121,9 @@
     iree_host_size_t binding_ptr_bytes = 0;
     IREE_RETURN_IF_ERROR(IREE_STRUCT_LAYOUT(
         0, &binding_ptr_bytes,
-        IREE_STRUCT_FIELD(command_buffer->binding_count, uint64_t, NULL)));
+        IREE_STRUCT_FIELD(dynamic_binding_slots.count, uint64_t, NULL)));
     IREE_TRACE_ZONE_BEGIN(z0);
-    IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, command_buffer->binding_count);
+    IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, dynamic_binding_slots.count);
     IREE_RETURN_AND_END_ZONE_IF_ERROR(
         z0, iree_arena_allocate(overflow_arena, binding_ptr_bytes,
                                 (void**)&binding_ptrs));
@@ -134,13 +134,6 @@
   for (uint16_t i = 0;
        i < dynamic_binding_slots.count && iree_status_is_ok(status); ++i) {
     const uint32_t slot = dynamic_binding_slots.values[i];
-    if (IREE_UNLIKELY(slot >= command_buffer->binding_count)) {
-      status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
-                                "command-buffer binding slot %" PRIu32
-                                " exceeds binding count %u",
-                                slot, command_buffer->binding_count);
-      break;
-    }
     if (IREE_UNLIKELY(slot >= binding_table.count)) {
       status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT,
                                 "queue_execute binding table slot %" PRIu32
@@ -149,7 +142,7 @@
       break;
     }
     status = iree_hal_amdgpu_host_queue_resolve_dispatch_binding_ptr(
-        &binding_table.bindings[slot], &binding_ptrs[slot]);
+        &binding_table.bindings[slot], &binding_ptrs[i]);
     if (!iree_status_is_ok(status)) {
       status =
           iree_status_annotate_f(status, "binding_table[%" PRIu32 "]", slot);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h
index 69a95c7..65d700e 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h
@@ -9,9 +9,9 @@
 
 #include "iree/base/api.h"
 
-// Queue_execute binding table prefix cached as raw device pointers under
-// submission_mutex while replaying an AQL command buffer. Larger tables use a
-// temporary arena block for the current submission.
+// Dense command-buffer dynamic binding slots cached as raw device pointers
+// under submission_mutex while replaying an AQL command buffer. Larger blocks
+// use temporary arena storage for the current submission.
 #define IREE_HAL_AMDGPU_HOST_QUEUE_COMMAND_BUFFER_BINDING_SCRATCH_CAPACITY 4096u
 
 // Queue_execute packet metadata cached under submission_mutex while replaying
@@ -23,7 +23,7 @@
 typedef struct iree_hal_amdgpu_host_queue_command_buffer_scratch_t {
   // Resolved queue_execute binding-table device pointers.
   struct {
-    // Raw device pointers indexed by queue_execute binding slot.
+    // Raw device pointers indexed by dynamic binding sidecar ordinal.
     uint64_t ptrs
         [IREE_HAL_AMDGPU_HOST_QUEUE_COMMAND_BUFFER_BINDING_SCRATCH_CAPACITY];
   } bindings;
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
index b0ad512..1ebd31c 100644
--- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
+++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
@@ -1728,7 +1728,7 @@
       iree_hal_make_buffer_ref(input_buffer, /*offset=*/0,
                                iree_hal_buffer_byte_length(input_buffer)),
       iree_hal_make_indirect_buffer_ref(
-          /*buffer_slot=*/0, /*offset=*/0,
+          /*buffer_slot=*/3, /*offset=*/0,
           iree_hal_buffer_byte_length(output_buffer)),
   };
   const iree_hal_buffer_ref_list_t dispatch_bindings = {
@@ -1745,7 +1745,7 @@
       IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT |
           IREE_HAL_COMMAND_BUFFER_MODE_RETAIN_PROFILE_METADATA,
       IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
-      /*binding_capacity=*/1, command_buffer.out()));
+      /*binding_capacity=*/4, command_buffer.out()));
   IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
   IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
       command_buffer, executable, /*entry_point=*/0,
@@ -1764,13 +1764,15 @@
           iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slots(
               command_buffer, program->first_block);
   ASSERT_EQ(dynamic_binding_slots.count, 1u);
-  EXPECT_EQ(dynamic_binding_slots.values[0], 0u);
+  EXPECT_EQ(dynamic_binding_slots.values[0], 3u);
   ASSERT_EQ(program->first_block->binding_source_count, 1u);
   const iree_hal_amdgpu_command_buffer_binding_source_t* binding_source =
       iree_hal_amdgpu_command_buffer_block_binding_sources_const(
           program->first_block);
   EXPECT_EQ(binding_source->flags,
             IREE_HAL_AMDGPU_COMMAND_BUFFER_BINDING_SOURCE_FLAG_DYNAMIC);
+  // Dynamic binding-source records index the dense resolved pointer table. The
+  // sidecar above maps ordinal 0 back to queue_execute binding table slot 3.
   EXPECT_EQ(binding_source->slot, 0u);
   EXPECT_EQ(binding_source->target_binding_ordinal, 1u);
 
@@ -1821,14 +1823,31 @@
       /*semaphores=*/&command_buffer_signal_ptr,
       /*payload_values=*/&command_buffer_signal_value,
   };
-  iree_hal_buffer_binding_t binding = {
-      /*buffer=*/output_buffer.get(),
-      /*offset=*/0,
-      /*length=*/IREE_HAL_WHOLE_BUFFER,
+  iree_hal_buffer_binding_t bindings[4] = {
+      {
+          /*buffer=*/input_buffer.get(),
+          /*offset=*/0,
+          /*length=*/IREE_HAL_WHOLE_BUFFER,
+      },
+      {
+          /*buffer=*/input_buffer.get(),
+          /*offset=*/0,
+          /*length=*/IREE_HAL_WHOLE_BUFFER,
+      },
+      {
+          /*buffer=*/input_buffer.get(),
+          /*offset=*/0,
+          /*length=*/IREE_HAL_WHOLE_BUFFER,
+      },
+      {
+          /*buffer=*/output_buffer.get(),
+          /*offset=*/0,
+          /*length=*/IREE_HAL_WHOLE_BUFFER,
+      },
   };
   const iree_hal_buffer_binding_table_t binding_table = {
-      /*count=*/1,
-      /*bindings=*/&binding,
+      /*count=*/IREE_ARRAYSIZE(bindings),
+      /*bindings=*/bindings,
   };
   IREE_ASSERT_OK(iree_hal_device_queue_execute(
       test_device.base_device(), IREE_HAL_QUEUE_AFFINITY_ANY,