[HAL/AMDGPU] Track upload ring reclaim positions

Extend notification-ring reclaim entries so queue-owned upload bytes can retire through the same completion epoch as kernargs. The new reclaim-position API reports both kernarg and upload ring watermarks, while the existing kernarg-only wrappers remain for current callers and tests that do not care about upload storage. Thread the upload watermark through kernel-shaped host queue submissions, including the failed-submission noop path that plugs already-reserved AQL slots. Host queue drain and teardown now reclaim all queue-owned ring positions through one helper; the upload watermark stays zero until a submission path actually allocates upload spans, so static command-buffer replay remains untouched. Add notification-ring coverage for reporting both queue-owned watermarks across zero-signal epochs.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue.c b/runtime/src/iree/hal/drivers/amdgpu/host_queue.c index 44d659f..d5dff81 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue.c +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue.c
@@ -89,6 +89,23 @@ queue, queue_device_reservation); } +static void iree_hal_amdgpu_host_queue_reclaim_queue_owned_positions( + iree_hal_amdgpu_host_queue_t* queue, + iree_hal_amdgpu_reclaim_positions_t reclaim_positions) { + if (reclaim_positions.kernarg_write_position > 0) { + iree_hal_amdgpu_kernarg_ring_reclaim( + &queue->kernarg_ring, reclaim_positions.kernarg_write_position); + } + if (reclaim_positions.queue_upload_write_position > 0) { + IREE_ASSERT(queue->queue_upload_ring.base, + "queue upload bytes retired without an initialized upload " + "ring"); + iree_hal_amdgpu_queue_upload_ring_reclaim( + &queue->queue_upload_ring, + reclaim_positions.queue_upload_write_position); + } +} + //===----------------------------------------------------------------------===// // Initialization / deinitialization //===----------------------------------------------------------------------===// @@ -141,20 +158,20 @@ &queue->error_status, iree_memory_order_acquire); const uint64_t previous_epoch = (uint64_t)iree_atomic_load( &queue->notification_ring.epoch.last_drained, iree_memory_order_relaxed); - uint64_t kernarg_reclaim_position = 0; + iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0}; iree_host_size_t count = 0; if (IREE_UNLIKELY(error)) { - count = iree_hal_amdgpu_notification_ring_fail_all( - &queue->notification_ring, error, &kernarg_reclaim_position); + count = iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions( + &queue->notification_ring, error, &reclaim_positions); iree_hal_amdgpu_host_queue_clear_profile_events(queue); iree_async_frontier_tracker_fail_axis( queue->frontier_tracker, queue->axis, iree_status_from_code(iree_status_code(error))); } else { - count = iree_hal_amdgpu_notification_ring_drain( + count = iree_hal_amdgpu_notification_ring_drain_reclaim_positions( &queue->notification_ring, /*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired, - queue, &kernarg_reclaim_position); + queue, &reclaim_positions); const 
uint64_t current_epoch = (uint64_t)iree_atomic_load(&queue->notification_ring.epoch.last_drained, iree_memory_order_acquire); @@ -163,10 +180,8 @@ current_epoch); } } - if (kernarg_reclaim_position > 0) { - iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring, - kernarg_reclaim_position); - } + iree_hal_amdgpu_host_queue_reclaim_queue_owned_positions(queue, + reclaim_positions); iree_hal_amdgpu_host_queue_run_post_drain_actions(queue); return count; } @@ -539,22 +554,20 @@ // error. Otherwise drain normally (entries completed but not yet processed). iree_status_t error = (iree_status_t)iree_atomic_load( &queue->error_status, iree_memory_order_acquire); - uint64_t kernarg_reclaim_position = 0; + iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0}; if (!iree_status_is_ok(error)) { - iree_hal_amdgpu_notification_ring_fail_all(&queue->notification_ring, error, - &kernarg_reclaim_position); + iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions( + &queue->notification_ring, error, &reclaim_positions); iree_hal_amdgpu_host_queue_clear_profile_events(queue); iree_status_free(error); } else { - iree_hal_amdgpu_notification_ring_drain( + iree_hal_amdgpu_notification_ring_drain_reclaim_positions( &queue->notification_ring, /*fallback_frontier=*/NULL, iree_hal_amdgpu_host_queue_reclaim_retired, - queue, &kernarg_reclaim_position); + queue, &reclaim_positions); } - if (kernarg_reclaim_position > 0) { - iree_hal_amdgpu_kernarg_ring_reclaim(&queue->kernarg_ring, - kernarg_reclaim_position); - } + iree_hal_amdgpu_host_queue_reclaim_queue_owned_positions(queue, + reclaim_positions); iree_hal_amdgpu_host_queue_run_post_drain_actions(queue); // Deregister from the epoch signal table before destroying the notification @@ -577,6 +590,11 @@ iree_hal_amdgpu_notification_ring_deinitialize(&queue->notification_ring); + if (queue->queue_upload_ring.base) { + iree_hal_amdgpu_queue_upload_ring_deinitialize(queue->libhsa, + &queue->queue_upload_ring); + } + 
iree_hal_amdgpu_kernarg_ring_deinitialize(queue->libhsa, &queue->kernarg_ring);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue.h b/runtime/src/iree/hal/drivers/amdgpu/host_queue.h index 9d90c8a..5fbdcd3 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue.h +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue.h
@@ -24,6 +24,7 @@ #include "iree/hal/drivers/amdgpu/util/libhsa.h" #include "iree/hal/drivers/amdgpu/util/notification_ring.h" #include "iree/hal/drivers/amdgpu/util/pm4_capabilities.h" +#include "iree/hal/drivers/amdgpu/util/queue_upload_ring.h" #include "iree/hal/drivers/amdgpu/virtual_queue.h" #include "iree/hal/pool.h" #include "iree/hal/profile_schema.h" @@ -172,6 +173,10 @@ // Per-queue kernarg bump allocator backed by HSA kernarg-init memory. iree_hal_amdgpu_kernarg_ring_t kernarg_ring; + // Optional per-queue upload ring for device-visible control records. + // Initialized when a submission path first needs device-side fixup inputs. + iree_hal_amdgpu_queue_upload_ring_t queue_upload_ring; + // Optional per-AQL-slot PM4 IB buffer used by PM4-backed wait, transfer, and // profiling snippets. This is not an independent scheduling ring: each slot // is indexed by the matching AQL packet id and inherits the AQL ring's
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c index 7054100..52dc842 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.c
@@ -151,8 +151,10 @@ static void iree_hal_amdgpu_host_queue_emit_reclaim_noop_packets( iree_hal_amdgpu_host_queue_t* queue, iree_hal_amdgpu_reclaim_entry_t* reclaim_entry, uint64_t first_packet_id, - uint32_t packet_count, uint64_t kernarg_write_position) { + uint32_t packet_count, uint64_t kernarg_write_position, + uint64_t queue_upload_write_position) { reclaim_entry->kernarg_write_position = kernarg_write_position; + reclaim_entry->queue_upload_write_position = queue_upload_write_position; reclaim_entry->count = 0; iree_hal_amdgpu_notification_ring_advance_epoch(&queue->notification_ring); for (uint32_t i = 0; i < packet_count; ++i) { @@ -547,7 +549,8 @@ iree_hal_amdgpu_host_queue_kernel_submission_t* submission) { iree_hal_amdgpu_host_queue_emit_reclaim_noop_packets( queue, submission->reclaim_entry, submission->first_packet_id, - submission->packet_count, submission->kernarg_write_position); + submission->packet_count, submission->kernarg_write_position, + submission->queue_upload_write_position); memset(submission, 0, sizeof(*submission)); } @@ -855,6 +858,8 @@ } submission->reclaim_entry->kernarg_write_position = submission->kernarg_write_position; + submission->reclaim_entry->queue_upload_write_position = + submission->queue_upload_write_position; submission->reclaim_entry->count = submission->reclaim_resource_count; submission->reclaim_entry->pre_signal_action = submission->pre_signal_action; iree_hal_amdgpu_host_queue_merge_barrier_axes(queue, resolution);
diff --git a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h index 2aebff2..0166543 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h +++ b/runtime/src/iree/hal/drivers/amdgpu/host_queue_submission.h
@@ -68,6 +68,8 @@ uint64_t first_packet_id; // Kernarg ring write position to reclaim after this submission completes. uint64_t kernarg_write_position; + // Queue upload ring write position to reclaim after completion. + uint64_t queue_upload_write_position; // Number of AQL packets reserved starting at |first_packet_id|. uint32_t packet_count; // Number of valid entries in |reclaim_resources|.
diff --git a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c index 831a8de..032de7b 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c +++ b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.c
@@ -56,6 +56,7 @@ entry->queue_device_event_count = 0; entry->resource_set = NULL; entry->kernarg_write_position = 0; + entry->queue_upload_write_position = 0; entry->count = 0; if (count <= IREE_HAL_AMDGPU_RECLAIM_INLINE_CAPACITY) { entry->resources = entry->inline_resources; @@ -107,6 +108,7 @@ entry->queue_device_event_count = 0; entry->resource_set = NULL; entry->kernarg_write_position = 0; + entry->queue_upload_write_position = 0; entry->count = 0; } @@ -541,15 +543,15 @@ fallback); } -iree_host_size_t iree_hal_amdgpu_notification_ring_drain( +iree_host_size_t iree_hal_amdgpu_notification_ring_drain_reclaim_positions( iree_hal_amdgpu_notification_ring_t* ring, const iree_async_frontier_t* fallback_frontier, iree_hal_amdgpu_reclaim_retire_fn_t retire_fn, void* retire_user_data, - uint64_t* out_kernarg_reclaim_position) { + iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions) { IREE_ASSERT_ARGUMENT(ring); - IREE_ASSERT_ARGUMENT(out_kernarg_reclaim_position); + IREE_ASSERT_ARGUMENT(out_reclaim_positions); - *out_kernarg_reclaim_position = 0; + memset(out_reclaim_positions, 0, sizeof(*out_reclaim_positions)); // Early out if the ring was never initialized or already deinitialized. if (!ring->epoch.signal.handle) return 0; @@ -649,6 +651,7 @@ // Release retained resources for all completed epochs. 
uint64_t highest_kernarg_position = 0; + uint64_t highest_queue_upload_position = 0; for (uint64_t epoch = previous_drained; epoch < current_epoch; ++epoch) { uint32_t reclaim_index = (uint32_t)(epoch & (ring->capacity - 1)); uint64_t kernarg_write_position = @@ -656,6 +659,11 @@ if (kernarg_write_position > highest_kernarg_position) { highest_kernarg_position = kernarg_write_position; } + uint64_t queue_upload_write_position = + ring->reclaim_entries[reclaim_index].queue_upload_write_position; + if (queue_upload_write_position > highest_queue_upload_position) { + highest_queue_upload_position = queue_upload_write_position; + } iree_hal_amdgpu_reclaim_entry_release(&ring->reclaim_entries[reclaim_index], ring->block_pool); } @@ -665,17 +673,34 @@ iree_hal_amdgpu_notification_ring_store_position(&ring->read, read, iree_memory_order_release); - *out_kernarg_reclaim_position = highest_kernarg_position; + out_reclaim_positions->kernarg_write_position = highest_kernarg_position; + out_reclaim_positions->queue_upload_write_position = + highest_queue_upload_position; return drained_count; } -iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all( - iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status, +iree_host_size_t iree_hal_amdgpu_notification_ring_drain( + iree_hal_amdgpu_notification_ring_t* ring, + const iree_async_frontier_t* fallback_frontier, + iree_hal_amdgpu_reclaim_retire_fn_t retire_fn, void* retire_user_data, uint64_t* out_kernarg_reclaim_position) { - IREE_ASSERT_ARGUMENT(ring); IREE_ASSERT_ARGUMENT(out_kernarg_reclaim_position); + iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0}; + iree_host_size_t drained_count = + iree_hal_amdgpu_notification_ring_drain_reclaim_positions( + ring, fallback_frontier, retire_fn, retire_user_data, + &reclaim_positions); + *out_kernarg_reclaim_position = reclaim_positions.kernarg_write_position; + return drained_count; +} - *out_kernarg_reclaim_position = 0; +iree_host_size_t 
iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions( + iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status, + iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions) { + IREE_ASSERT_ARGUMENT(ring); + IREE_ASSERT_ARGUMENT(out_reclaim_positions); + + memset(out_reclaim_positions, 0, sizeof(*out_reclaim_positions)); iree_host_size_t failed_count = 0; uint64_t read = iree_hal_amdgpu_notification_ring_load_position( @@ -704,6 +729,7 @@ // Release retained resources for all epochs. uint64_t highest_kernarg_position = 0; + uint64_t highest_queue_upload_position = 0; uint64_t last_drained = (uint64_t)iree_atomic_load(&ring->epoch.last_drained, iree_memory_order_relaxed); for (uint64_t epoch = last_drained; epoch < ring->epoch.next_submission; @@ -714,6 +740,11 @@ if (kernarg_write_position > highest_kernarg_position) { highest_kernarg_position = kernarg_write_position; } + uint64_t queue_upload_write_position = + ring->reclaim_entries[reclaim_index].queue_upload_write_position; + if (queue_upload_write_position > highest_queue_upload_position) { + highest_queue_upload_position = queue_upload_write_position; + } iree_hal_amdgpu_reclaim_entry_execute_pre_signal_action( &ring->reclaim_entries[reclaim_index], error_status); iree_hal_amdgpu_reclaim_entry_release(&ring->reclaim_entries[reclaim_index], @@ -726,6 +757,20 @@ iree_hal_amdgpu_notification_ring_store_position(&ring->read, read, iree_memory_order_release); - *out_kernarg_reclaim_position = highest_kernarg_position; + out_reclaim_positions->kernarg_write_position = highest_kernarg_position; + out_reclaim_positions->queue_upload_write_position = + highest_queue_upload_position; + return failed_count; +} + +iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all( + iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status, + uint64_t* out_kernarg_reclaim_position) { + IREE_ASSERT_ARGUMENT(out_kernarg_reclaim_position); + iree_hal_amdgpu_reclaim_positions_t reclaim_positions = 
{0}; + iree_host_size_t failed_count = + iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions( + ring, error_status, &reclaim_positions); + *out_kernarg_reclaim_position = reclaim_positions.kernarg_write_position; return failed_count; }
diff --git a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h index 2a0e593..d02119a 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h +++ b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring.h
@@ -163,6 +163,14 @@ void* user_data; } iree_hal_amdgpu_reclaim_action_t; +// Queue-owned ring positions retired by one or more completed epochs. +typedef struct iree_hal_amdgpu_reclaim_positions_t { + // Highest kernarg ring write position retired by the completed epochs. + uint64_t kernarg_write_position; + // Highest queue upload ring write position retired by the completed epochs. + uint64_t queue_upload_write_position; +} iree_hal_amdgpu_reclaim_positions_t; + // Optional callback invoked for one completed epoch after pre-signal actions // execute and before user-visible semaphore signals publish. typedef void(IREE_API_PTR* iree_hal_amdgpu_reclaim_retire_fn_t)( @@ -196,6 +204,10 @@ // report the highest position across retired epochs so the caller can reclaim // kernarg blocks. 0 means no kernarg was allocated. uint64_t kernarg_write_position; + // Queue upload ring write position at the time of this submission. + // Drain/fail_all report the highest position across retired epochs so the + // caller can reclaim upload bytes. 0 means no upload bytes were allocated. + uint64_t queue_upload_write_position; // Number of dispatch profiling events reserved by this epoch. uint32_t profile_event_count; // Number of queue device profiling events reserved by this epoch. @@ -212,8 +224,8 @@ // Prepares a reclaim entry for |count| resources. If count fits inline, // sets |*out_resources| to the entry's inline storage. Otherwise acquires // a block from |block_pool| and sets |*out_resources| to point into it. -// The caller fills the array with retained resource pointers, sets -// entry->kernarg_write_position, and sets entry->count before advancing the +// The caller fills the array with retained resource pointers, sets any +// queue-owned ring write positions, and sets entry->count before advancing the // submission epoch. 
iree_status_t iree_hal_amdgpu_reclaim_entry_prepare( iree_hal_amdgpu_reclaim_entry_t* entry, iree_arena_block_pool_t* block_pool, @@ -401,6 +413,20 @@ // caller has already merged frontier state into the semaphore at submission // time and only needs completion-time timeline advancement/untainting. // +// Stores the highest queue-owned ring positions across all retired epochs in +// |out_reclaim_positions|. Positions are set to 0 if no epochs were retired. +// +// |retire_fn|, when provided, is called once per retired epoch before +// user-visible semaphore publication. It must not publish user-visible +// completion itself. +// +// Returns the number of entries drained. +iree_host_size_t iree_hal_amdgpu_notification_ring_drain_reclaim_positions( + iree_hal_amdgpu_notification_ring_t* ring, + const iree_async_frontier_t* fallback_frontier, + iree_hal_amdgpu_reclaim_retire_fn_t retire_fn, void* retire_user_data, + iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions); + // Stores the highest kernarg_write_position across all retired epochs in // |out_kernarg_reclaim_position|. Set to 0 if no epochs were retired. // @@ -422,6 +448,14 @@ // // |error_status| is borrowed, not consumed — the caller retains ownership. // +// Stores the highest queue-owned ring positions across all failed entries in +// |out_reclaim_positions| (same semantics as drain_reclaim_positions). +// +// Returns the number of entries failed. +iree_host_size_t iree_hal_amdgpu_notification_ring_fail_all_reclaim_positions( + iree_hal_amdgpu_notification_ring_t* ring, iree_status_t error_status, + iree_hal_amdgpu_reclaim_positions_t* out_reclaim_positions); + // Stores the highest kernarg_write_position across all failed entries in // |out_kernarg_reclaim_position| (same semantics as drain). //
diff --git a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc index 3d166c8..25f5701 100644 --- a/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc +++ b/runtime/src/iree/hal/drivers/amdgpu/util/notification_ring_test.cc
@@ -157,10 +157,12 @@ static iree_hal_amdgpu_reclaim_entry_t* ReclaimEntryForNextEpoch( iree_hal_amdgpu_notification_ring_t* ring, - uint64_t kernarg_write_position = 0) { + uint64_t kernarg_write_position = 0, + uint64_t queue_upload_write_position = 0) { iree_hal_amdgpu_reclaim_entry_t* reclaim_entry = iree_hal_amdgpu_notification_ring_reclaim_entry(ring); reclaim_entry->kernarg_write_position = kernarg_write_position; + reclaim_entry->queue_upload_write_position = queue_upload_write_position; return reclaim_entry; } @@ -644,6 +646,29 @@ iree_async_semaphore_release(semaphore); } +TEST_F(NotificationRingTest, QueueOwnedReclaimPositionReporting) { + IREE_ASSERT_OK_AND_ASSIGN(auto ring, InitializeRing()); + + // Epoch 1: no user-visible signals, but both queue-owned rings must retire. + ReclaimEntryForNextEpoch(ring.get(), /*kernarg_write_position=*/64, + /*queue_upload_write_position=*/256); + EXPECT_EQ(iree_hal_amdgpu_notification_ring_advance_epoch(ring.get()), 1u); + + // Epoch 2: later kernargs but an earlier upload watermark. + ReclaimEntryForNextEpoch(ring.get(), /*kernarg_write_position=*/192, + /*queue_upload_write_position=*/128); + EXPECT_EQ(iree_hal_amdgpu_notification_ring_advance_epoch(ring.get()), 2u); + + SimulateCompletions(ring.get(), 2); + iree_hal_amdgpu_reclaim_positions_t reclaim_positions = {0}; + EXPECT_EQ( + iree_hal_amdgpu_notification_ring_drain_reclaim_positions( + ring.get(), &kEmptyFrontier, nullptr, nullptr, &reclaim_positions), + 0u); + EXPECT_EQ(reclaim_positions.kernarg_write_position, 192u); + EXPECT_EQ(reclaim_positions.queue_upload_write_position, 256u); +} + TEST_F(NotificationRingTest, KernargPositionReportingForZeroSignalEpochs) { IREE_ASSERT_OK_AND_ASSIGN(auto ring, InitializeRing());