[HAL/AMDGPU] Compact dynamic binding pointer replay

Dynamic command-buffer replay was still caching resolved binding pointers in a
sparse array indexed by the original queue_execute binding slot. That kept
replay scratch storage tied to the command buffer maximum binding slot even
after finalization had built a compact per-block dynamic slot sidecar.

Rewrite dynamic dispatch binding-source slots at command-buffer finalization to
dense sidecar ordinals. The sidecar continues to store the original
queue_execute binding slots, while host replay resolves only those used slots
into a compact pointer table consumed by both the base and profiling block
processors.

Static and fully prepublished blocks keep the same no-sidecar path, while
dynamic replay gets a denser host-side representation for future queue-upload
and device-fixup publication.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h b/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h index 83a3d91..6b44774 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h +++ b/runtime/src/iree/hal/drivers/amdgpu/abi/command_buffer.h
@@ -224,11 +224,18 @@ iree_hal_amdgpu_command_buffer_binding_source_t { // Static raw source: final raw device pointer. // - // Dynamic or static-buffer source: byte offset added to the queue_execute - // binding table slot or command-buffer static buffer ordinal in |slot|. + // Dynamic source: byte offset added to the resolved pointer table entry in + // |slot|. The block dynamic-binding-slot sidecar maps that dense table entry + // back to the original queue_execute binding table slot. + // + // Static-buffer source: byte offset added to the command-buffer static buffer + // ordinal in |slot|. uint64_t offset_or_pointer; - // Dynamic source binding table slot or static buffer ordinal. Must be zero - // for raw static sources. + // Dynamic source dense resolved pointer table ordinal or static buffer + // ordinal. Dynamic indirect-parameter sources keep the original queue_execute + // binding table slot because they are resolved directly from the binding + // table instead of the dispatch kernarg pointer cache. Must be zero for raw + // static sources. uint32_t slot; // Destination HAL ABI binding pointer ordinal for compact patch lists. uint16_t target_binding_ordinal;
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h index 87d7736..074319d 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h +++ b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor.h
@@ -42,7 +42,7 @@ struct { // Binding table supplied to queue_execute. iree_hal_buffer_binding_table_t table; - // Pre-resolved dynamic binding pointers indexed by binding slot. + // Pre-resolved dynamic binding pointers indexed by block sidecar ordinal. const uint64_t* ptrs; } bindings; // Reserved packet span populated by the processor.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h index fccbfce..924fc5e 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h +++ b/runtime/src/iree/hal/drivers/amdgpu/aql_block_processor_profile.h
@@ -68,7 +68,7 @@ struct { // Binding table supplied to queue_execute. iree_hal_buffer_binding_table_t table; - // Pre-resolved dynamic binding pointers indexed by binding slot. + // Pre-resolved dynamic binding pointers indexed by block sidecar ordinal. const uint64_t* ptrs; } bindings; // Reserved packet span populated by profiled replay.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c index e3f89cd..c70eaa7 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c +++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.c
@@ -767,13 +767,17 @@ 0; } -static bool -iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slot_list_contains( - const uint32_t* values, uint16_t count, uint32_t slot) { +static uint16_t +iree_hal_amdgpu_aql_command_buffer_find_or_append_dynamic_binding_slot( + uint32_t* values, uint16_t* inout_count, uint16_t capacity, uint32_t slot) { + const uint16_t count = *inout_count; for (uint16_t i = 0; i < count; ++i) { - if (values[i] == slot) return true; + if (values[i] == slot) return i; } - return false; + IREE_ASSERT(count < capacity); + values[count] = slot; + *inout_count = count + 1; + return count; } static iree_status_t @@ -798,10 +802,10 @@ (const uint32_t*)((uint8_t*)slot_block + values_offset); uint32_t* slot_values = (uint32_t*)((uint8_t*)slot_block + values_offset); - const iree_hal_amdgpu_command_buffer_binding_source_t* binding_sources = - iree_hal_amdgpu_command_buffer_block_binding_sources_const(block); + iree_hal_amdgpu_command_buffer_binding_source_t* binding_sources = + iree_hal_amdgpu_command_buffer_block_binding_sources(block); for (uint16_t i = 0; i < block->binding_source_count; ++i) { - const iree_hal_amdgpu_command_buffer_binding_source_t* binding_source = + iree_hal_amdgpu_command_buffer_binding_source_t* binding_source = &binding_sources[i]; if (!iree_hal_amdgpu_aql_command_buffer_binding_source_uses_dynamic_binding_slot( binding_source)) { @@ -816,11 +820,10 @@ block->block_ordinal, binding_source->slot, command_buffer->base.binding_count); } - if (iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slot_list_contains( - slot_values, slot_block->slots.count, binding_source->slot)) { - continue; - } - slot_values[slot_block->slots.count++] = binding_source->slot; + binding_source->slot = + iree_hal_amdgpu_aql_command_buffer_find_or_append_dynamic_binding_slot( + slot_values, &slot_block->slots.count, dynamic_source_count, + binding_source->slot); } if (slot_block->slots.count == 0) return iree_ok_status(); if 
(IREE_UNLIKELY(command_buffer->dynamic_binding_slots.count >
diff --git a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h index 6f74a83..26a99e6 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h +++ b/runtime/src/iree/hal/drivers/amdgpu/aql_command_buffer.h
@@ -78,7 +78,9 @@ // Dynamic queue_execute binding slots used by one finalized block. typedef struct iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slots_t { - // Binding table slots resolved into raw device pointers before block replay. + // Queue_execute binding table slots resolved into the dense raw pointer table + // before block replay. Dynamic binding-source records index this array by + // dense ordinal, not by original queue_execute binding slot. const uint32_t* values; // Number of entries in |values|. uint16_t count;
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c index 7bb8704..0c40c9a 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_block.c
@@ -112,7 +112,7 @@ } uint64_t* binding_ptrs = NULL; - if (command_buffer->binding_count <= + if (dynamic_binding_slots.count <= IREE_HAL_AMDGPU_HOST_QUEUE_COMMAND_BUFFER_BINDING_SCRATCH_CAPACITY) { IREE_RETURN_IF_ERROR( iree_hal_amdgpu_host_queue_ensure_command_buffer_scratch(queue)); @@ -121,9 +121,9 @@ iree_host_size_t binding_ptr_bytes = 0; IREE_RETURN_IF_ERROR(IREE_STRUCT_LAYOUT( 0, &binding_ptr_bytes, - IREE_STRUCT_FIELD(command_buffer->binding_count, uint64_t, NULL))); + IREE_STRUCT_FIELD(dynamic_binding_slots.count, uint64_t, NULL))); IREE_TRACE_ZONE_BEGIN(z0); - IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, command_buffer->binding_count); + IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, dynamic_binding_slots.count); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_arena_allocate(overflow_arena, binding_ptr_bytes, (void**)&binding_ptrs)); @@ -134,13 +134,6 @@ for (uint16_t i = 0; i < dynamic_binding_slots.count && iree_status_is_ok(status); ++i) { const uint32_t slot = dynamic_binding_slots.values[i]; - if (IREE_UNLIKELY(slot >= command_buffer->binding_count)) { - status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, - "command-buffer binding slot %" PRIu32 - " exceeds binding count %u", - slot, command_buffer->binding_count); - break; - } if (IREE_UNLIKELY(slot >= binding_table.count)) { status = iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "queue_execute binding table slot %" PRIu32 @@ -149,7 +142,7 @@ break; } status = iree_hal_amdgpu_host_queue_resolve_dispatch_binding_ptr( - &binding_table.bindings[slot], &binding_ptrs[slot]); + &binding_table.bindings[slot], &binding_ptrs[i]); if (!iree_status_is_ok(status)) { status = iree_status_annotate_f(status, "binding_table[%" PRIu32 "]", slot);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h index 69a95c7..65d700e 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_scratch.h
@@ -9,9 +9,9 @@ #include "iree/base/api.h" -// Queue_execute binding table prefix cached as raw device pointers under -// submission_mutex while replaying an AQL command buffer. Larger tables use a -// temporary arena block for the current submission. +// Dense command-buffer dynamic binding slots cached as raw device pointers +// under submission_mutex while replaying an AQL command buffer. Larger blocks +// use temporary arena storage for the current submission. #define IREE_HAL_AMDGPU_HOST_QUEUE_COMMAND_BUFFER_BINDING_SCRATCH_CAPACITY 4096u // Queue_execute packet metadata cached under submission_mutex while replaying @@ -23,7 +23,7 @@ typedef struct iree_hal_amdgpu_host_queue_command_buffer_scratch_t { // Resolved queue_execute binding-table device pointers. struct { - // Raw device pointers indexed by queue_execute binding slot. + // Raw device pointers indexed by dynamic binding sidecar ordinal. uint64_t ptrs [IREE_HAL_AMDGPU_HOST_QUEUE_COMMAND_BUFFER_BINDING_SCRATCH_CAPACITY]; } bindings;
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc index b0ad512..1ebd31c 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_command_buffer_test.cc
@@ -1728,7 +1728,7 @@ iree_hal_make_buffer_ref(input_buffer, /*offset=*/0, iree_hal_buffer_byte_length(input_buffer)), iree_hal_make_indirect_buffer_ref( - /*buffer_slot=*/0, /*offset=*/0, + /*buffer_slot=*/3, /*offset=*/0, iree_hal_buffer_byte_length(output_buffer)), }; const iree_hal_buffer_ref_list_t dispatch_bindings = { @@ -1745,7 +1745,7 @@ IREE_HAL_COMMAND_BUFFER_MODE_DEFAULT | IREE_HAL_COMMAND_BUFFER_MODE_RETAIN_PROFILE_METADATA, IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, - /*binding_capacity=*/1, command_buffer.out())); + /*binding_capacity=*/4, command_buffer.out())); IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); IREE_ASSERT_OK(iree_hal_command_buffer_dispatch( command_buffer, executable, /*entry_point=*/0, @@ -1764,13 +1764,15 @@ iree_hal_amdgpu_aql_command_buffer_dynamic_binding_slots( command_buffer, program->first_block); ASSERT_EQ(dynamic_binding_slots.count, 1u); - EXPECT_EQ(dynamic_binding_slots.values[0], 0u); + EXPECT_EQ(dynamic_binding_slots.values[0], 3u); ASSERT_EQ(program->first_block->binding_source_count, 1u); const iree_hal_amdgpu_command_buffer_binding_source_t* binding_source = iree_hal_amdgpu_command_buffer_block_binding_sources_const( program->first_block); EXPECT_EQ(binding_source->flags, IREE_HAL_AMDGPU_COMMAND_BUFFER_BINDING_SOURCE_FLAG_DYNAMIC); + // Dynamic binding-source records index the dense resolved pointer table. The + // sidecar above maps ordinal 0 back to queue_execute binding table slot 3. 
EXPECT_EQ(binding_source->slot, 0u); EXPECT_EQ(binding_source->target_binding_ordinal, 1u); @@ -1821,14 +1823,31 @@ /*semaphores=*/&command_buffer_signal_ptr, /*payload_values=*/&command_buffer_signal_value, }; - iree_hal_buffer_binding_t binding = { - /*buffer=*/output_buffer.get(), - /*offset=*/0, - /*length=*/IREE_HAL_WHOLE_BUFFER, + iree_hal_buffer_binding_t bindings[4] = { + { + /*buffer=*/input_buffer.get(), + /*offset=*/0, + /*length=*/IREE_HAL_WHOLE_BUFFER, + }, + { + /*buffer=*/input_buffer.get(), + /*offset=*/0, + /*length=*/IREE_HAL_WHOLE_BUFFER, + }, + { + /*buffer=*/input_buffer.get(), + /*offset=*/0, + /*length=*/IREE_HAL_WHOLE_BUFFER, + }, + { + /*buffer=*/output_buffer.get(), + /*offset=*/0, + /*length=*/IREE_HAL_WHOLE_BUFFER, + }, }; const iree_hal_buffer_binding_table_t binding_table = { - /*count=*/1, - /*bindings=*/&binding, + /*count=*/IREE_ARRAYSIZE(bindings), + /*bindings=*/bindings, }; IREE_ASSERT_OK(iree_hal_device_queue_execute( test_device.base_device(), IREE_HAL_QUEUE_AFFINITY_ANY,